author = {Chandola, Varun and Banerjee, Arindam and Kumar, Vipin},
title = {Anomaly Detection: A Survey},
journal = {ACM Computing Surveys},
volume = {41},
number = {3},
pages = {1--58},
year = {2009},
doi = {10.1145/1541880.1541882},
publisher = {ACM}
}
@article{bosman2017,
author = {Hedde HWJ Bosman and Giovanni Iacca and Arturo Tejada and Heinrich J Wörtche and Antonio Liotta},
number = {213},
year = {2020},
pages = {1--39}
}
@inproceedings{rajasegarar2007,
title={Quarter sphere based distributed anomaly detection in wireless sensor networks},
author={Rajasegarar, Sutharshan and Leckie, Christopher and Palaniswami, Marimuthu and Bezdek, James C},
booktitle={2007 IEEE International Conference on Communications},
pages={3864--3869},
year={2007},
organization={IEEE}
}
@inproceedings{moshtaghi2011,
title={Incremental elliptical boundary estimation for anomaly detection in wireless sensor networks},
author={Moshtaghi, Masud and Leckie, Christopher and Karunasekera, Shanika and Bezdek, James C and Rajasegarar, Sutharshan and Palaniswami, Marimuthu},
booktitle={2011 IEEE 11th International Conference on Data Mining},
pages={467--476},
year={2011},
organization={IEEE}
}
% drift
@article{ni2009,
title={Sensor network data fault types},
author={Ni, Kevin and Ramanathan, Nithya and Chehade, Mohamed Nabil Hajj and Balzano, Laura and Nair, Sheela and Zahedi, Sadaf and Kohler, Eddie and Pottie, Greg and Hansen, Mark and Srivastava, Mani},
journal={ACM Transactions on Sensor Networks (TOSN)},
volume={5},
number={3},
pages={1--29},
year={2009},
publisher={ACM New York, NY, USA}
}
@article{wu2019,
title={Drift Calibration Using Constrained Extreme Learning Machine and Kalman Filter in Clustered Wireless Sensor Networks},
author={Wu, Jiawen and Li, Guanghui},
journal={IEEE Access},
volume={8},
pages={13078--13085},
year={2019},
publisher={IEEE}
}
@article{barcelo2019,
title={Self-calibration methods for uncontrolled environments in sensor networks: A reference survey},
author={Barcelo-Ordinas, Jose M and Doudou, Messaoud and Garcia-Vidal, Jorge and Badache, Nadjib},
journal={Ad Hoc Networks},
volume={88},
pages={142--159},
year={2019},
publisher={Elsevier}
}
@article{dehkordi2020,
title={A survey on data aggregation techniques in IoT sensor networks},
author={Dehkordi, Soroush Abbasian and Farajzadeh, Kamran and Rezazadeh, Javad and Farahbakhsh, Reza and Sandrasegaran, Kumbesan and Dehkordi, Masih Abbasian},
journal={Wireless Networks},
volume={26},
number={2},
pages={1243--1263},
year={2020},
publisher={Springer}
}
@article{wang2016,
title={Blind drift calibration of sensor networks using sparse Bayesian learning},
author={Wang, Yuzhi and Yang, Anqi and Li, Zhan and Chen, Xiaoming and Wang, Pengjun and Yang, Huazhong},
journal={IEEE Sensors Journal},
volume={16},
number={16},
pages={6249--6260},
year={2016},
publisher={IEEE}
}
@inproceedings{buonadonna2005,
title={TASK: Sensor network in a box},
author={Buonadonna, Philip and Gay, David and Hellerstein, Joseph M and Hong, Wei and Madden, Samuel},
booktitle={Proceedings of the Second European Workshop on Wireless Sensor Networks, 2005},
pages={133--144},
year={2005},
organization={IEEE}
}
% noise
@inproceedings{elnahrawy2003,
title={Cleaning and querying noisy sensors},
author={Elnahrawy, Eiman and Nath, Badri},
booktitle={Proceedings of the 2nd ACM international conference on Wireless sensor networks and applications},
pages={78--87},
year={2003}
}
@article{stankovic2018,
title={On consensus-based distributed blind calibration of sensor networks},
author={Stankovi{\'c}, Milo{\v{s}} S and Stankovi{\'c}, Srdjan S and Johansson, Karl Henrik and Beko, Marko and Camarinha-Matos, Luis M},
journal={Sensors},
volume={18},
number={11},
pages={4027},
year={2018},
publisher={Multidisciplinary Digital Publishing Institute}
}
@inproceedings{kumar2013,
title={Automatic sensor drift detection and correction using spatial kriging and Kalman filtering},
author={Kumar, Dheeraj and Rajasegarar, Sutharshan and Palaniswami, Marimuthu},
booktitle={2013 IEEE International Conference on Distributed Computing in Sensor Systems},
pages={183--190},
year={2013},
organization={IEEE}
}
@inproceedings{barcelo2018,
title={Calibrating low-cost air quality sensors using multiple arrays of sensors},
author={Barcelo-Ordinas, Jose M and Garcia-Vidal, Jorge and Doudou, Messaoud and Rodrigo-Mu{\~n}oz, Santiago and Cerezo-Llavero, Albert},
booktitle={2018 IEEE Wireless Communications and Networking Conference (WCNC)},
pages={1--6},
year={2018},
organization={IEEE}
}
@article{ramanathan2006,
title={Rapid deployment with confidence: Calibration and fault detection in environmental sensor networks},
author={Ramanathan, Nithya and Balzano, Laura and Burt, Marci and Estrin, Deborah and Harmon, Tom and Harvey, Charlie and Jay, Jenny and Kohler, Eddie and Rothenberg, Sarah and Srivastava, Mani},
year={2006}
}
@inproceedings{hasenfratz2012,
title={On-the-fly calibration of low-cost gas sensors},
author={Hasenfratz, David and Saukh, Olga and Thiele, Lothar},
booktitle={European Conference on Wireless Sensor Networks},
pages={228--244},
year={2012},
organization={Springer}
}
@article{maag2017,
title={SCAN: Multi-hop calibration for mobile sensor arrays},
author={Maag, Balz and Zhou, Zimu and Saukh, Olga and Thiele, Lothar},
journal={Proceedings of the ACM on Interactive, Mobile, Wearable and Ubiquitous Technologies},
year = {2017}
}
The root problem of drift detection and correction is predicting sensor measurements. This usually requires one or more time series of data and an algorithm which consumes these time series and produces a prediction for the value the sensor should measure next. The most commonly used model for this by far is the \emph{Kalman filter}, which operates in two phases:
Given the previous state of knowledge at step $k-1$ (estimated system state and uncertainty), we calculate a prediction for the next system state and uncertainty. This is the prediction phase. We then observe a new (possibly skewed) measurement and compute an updated estimate of the actual current state and uncertainty (update phase). This algorithm is recursive in nature and can be computed in real time on limited hardware.
The Kalman filter is based on a linear dynamical system over a discrete time domain. It represents the system state as vectors and matrices of real numbers. In order to use a Kalman filter, the observed process must be modeled in a specific structure:
\begin{itemize}
\item $F_k$, the state transition model for the $k$-th step
\item $H_k$, the observation model for the $k$-th step
\item $Q_k$, the covariance of the process noise
\item $R_k$, the covariance of the observation noise
\item Sometimes a control input model $B_k$
\end{itemize}
These models must predict the true state $x$ and an observation $z$ in the $k$-th step according to:
\begin{align*}
x_k &= F_kx_{k-1} + B_ku_k + w_k \\
z_k &= H_kx_k+v_k
\end{align*}
Where $w_k$ and $v_k$ are noise terms following zero-mean multivariate normal distributions with covariances $Q_k$ and $R_k$ respectively ($w_k \sim \mathcal{N}(0,Q_k)$ and $v_k \sim \mathcal{N}(0,R_k)$).
The Kalman filter state is represented by two variables $\hat{x}_{k|j}$ and $P_{k|j}$ which are the state estimate and covariance at step $k$ given observations up to and including $j$.
When entering step $k$, we can now define the two phases. \textbf{Prediction phase:}
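In this notation, the standard Kalman filter prediction equations are
\begin{align*}
\hat{x}_{k|k-1} &= F_k\hat{x}_{k-1|k-1} + B_ku_k \\
P_{k|k-1} &= F_kP_{k-1|k-1}F_k^T + Q_k
\end{align*}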
Where we predict the next state and calculate our confidence in that prediction. If we are now given our measurement $z_k$, we enter the next phase. \textbf{Update phase:}
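In the same notation, the standard update equations use the innovation $\tilde{y}_k = z_k - H_k\hat{x}_{k|k-1}$ and its covariance $S_k = H_kP_{k|k-1}H_k^T + R_k$ to compute the Kalman gain and the corrected estimate:
\begin{align*}
K_k &= P_{k|k-1}H_k^TS_k^{-1} \\
\hat{x}_{k|k} &= \hat{x}_{k|k-1} + K_k\tilde{y}_k \\
P_{k|k} &= (I - K_kH_k)P_{k|k-1}
\end{align*}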
The context of WSN introduces a lot of interesting new challenges: nodes are often small devices running on battery power and cannot do much computation on their own. Furthermore, in WSNs communication is often not perfect and messages can and will get lost during operation. Any protocol that incurs additional communication must have a good justification, as communication is expensive. All these factors create a unique environment, in which not many existing solutions to the problem are applicable.
In this paper, we will not discuss anomaly detection in hostile environments, or intrusion detection, but rather focus solely on anomaly detection in sensor data collected by the WSN.
There are many different approaches to anomaly detection; we will differentiate between centralized and decentralized approaches. An approach is considered centralized when a large chunk of the computation is done at a single point, or at a later stage during analysis. A decentralized approach implies that a considerable amount of processing is done on the individual nodes, doing analysis on the fly. When analysis is done centrally, it is important to differentiate between online and offline detection. Online detection can run while the WSN is operating, while offline detection is done after the data is collected. Offline detection methods can often be modified to work online, but this requires an existing dataset. Online detection often reduces mission duration due to increased power consumption, but can have the opposite effect if it eliminates a large amount of communication.
\subsection{Anomaly types}
Furthermore, we need to clarify the different kinds of anomalies that can occur in WSN datasets. Bosman et al. \cite{bosman2017} propose four different kinds of anomalies that occur in WSN:
\begin{itemize}
\item\emph{Spikes or outliers} are short changes with a large amplitude
\item\emph{Noise} is (an increase of) variance over time
\item\emph{Drift} is an offset which increases over time
\item\emph{Constant} is a constant offset
\end{itemize}
No method can account for all four types of anomalies at once. Therefore, we will look into sensor self-calibration, which removes drift and constant anomalies, followed by outlier detection to detect spikes. Working with noisy data is a problem in WSN, but we will not focus on methods of cleaning noisy data, as it is not in the scope of this survey. Elnahrawy et al. \cite{elnahrawy2003} and Barcelo et al. \cite{barcelo2019} are great places to start if you are interested in this topic.
A fifth anomaly type, \emph{sensor failure}, is commonly added in the anomaly detection literature \cite{rajasegarar2008,chandola2009}. Since sensor failure often manifests in one of the four ways mentioned above, and we are not interested in sensor fault prediction, detection and management here, faulty sensors will not be discussed further.
\section{Sensor drift and self-calibration}
Advancements in energy storage density, processing power and sensor availability have increased the possible mission time of many WSNs. This increase in mission time, together with an increase in node count due to reduced part cost \cite{wang2016}, as well as the introduction of the Internet of Things (IoT), have brought forth new problems in sensor calibration and drift detection \cite{dehkordi2020}. Increasing the amount of collected data and the length of time over which it is collected introduces a need for better quality control of the sensors that data came from. Ni et al. \cite{ni2009} noticed drift as high as 200\% in soil CO$_2$ sensors, while Buonadonna et al. \cite{buonadonna2005} noticed that their light sensors (which were calibrated to the manufacturer's specification) performed very poorly when measured against laboratory equipment. It is out of these circumstances that the need arises for better and more frequent sensor calibration.
\caption{Categories of calibration approaches, from Barcelo-Ordinas et al. \cite{barcelo2019}}
\label{fig:calcats}
\end{figure*}
The field of self-calibration in WSN is quite broad; in order to get an overview of all approaches, Barcelo-Ordinas et al. \cite{barcelo2019} categorize each approach by seven different attributes (Figure \ref{fig:calcats}):
\begin{itemize}
\item\emph{Area of interest} distinguishes between \emph{micro} (calibrating sensors to minimize error to a single data point), and \emph{macro} (calibrating nodes to minimize error over a given area of nodes).
\item\emph{Number of sensors} determines if data from other sensors is used, so-called \emph{sensor fusion}, or if calibration is done with just a \emph{single sensor}.
\item\emph{Ground truth} specifies whether the calibration is done in relation to a known good sensor (\emph{non-blind}) or without one (\emph{blind}). If both calibrated and uncalibrated sensors are used, the approach is considered \emph{semi-blind}.
\item\emph{Position from reference} is the distance between the calibration target and the point where the reference data is collected. If data from the close neighborhood is used, the approach is considered \emph{collocated}. If instead nodes are calibrated hop-by-hop in an iterative fashion, it is called \emph{multi-hop}. In \emph{model-based} calibration, fixed ground truth sensors are used in combination with a model to predict sensor error.
\item\emph{Calibration time} distinguishes between \emph{pre/post-\break deployment calibration}, \emph{periodic} (calibration at given intervals) and \emph{opportunistic} (when nodes in a mobile network come into range of a calibration source).
\item\emph{Operation mode} is either \emph{offline} (calibration when the node is not used) or \emph{online} (calibration during normal operation).
\item\emph{Processing mode} divides the approaches into \emph{centralized} processing, meaning calibration parameters are calculated by a central node and then distributed over the network, and \emph{decentralized} processing, where a single node or a collection of nodes collaborate to calculate their calibration parameters.
\end{itemize}
Not all methods can detect every type of anomaly equally well, therefore we will note for each method whether this was accounted for and how good the detection was for each given type.
This level of specialization requires its own survey, the most recent of which is Barcelo-Ordinas et al. \cite{barcelo2019}. They categorize 39 approaches into these attributes and discuss them in depth. We will instead look at some central problems and ideas behind these approaches in detail:
\subsection{Problems in blind self-calibration approaches}
The central problem in self-calibration is predicting the error of a given sensor. Since this is such a broad problem, many different solutions exist.
Kumar et al. \cite{kumar2013} propose a solution that uses no ground-truth sensors and can be used online in a distributed fashion. It uses spatial kriging (Gaussian interpolation) and Kalman filtering (a linear approximation model accounting for noise) on neighborhood data in order to reduce noise and remove drift. This solution suffers from accumulative error due to the missing ground truth, as the system has no point of reference or general model to rely on. The uncertainty of the model, and thereby the accumulative error, can be reduced by increasing the number of sensors used. A common method for gaining more measurements is increasing network density \cite{wang2016}, or switching from a single-sensor approach to sensor fusion. Barcelo-Ordinas et al. \cite{barcelo2018} explore the possibility of adding multiple copies of the same kind of sensor to each node.
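To make this concrete, the following minimal sketch (not Kumar et al.'s actual algorithm) uses a scalar Kalman filter to track a slowly growing offset between a node's readings and a reference estimate, e.g. an interpolation from neighboring nodes; the random-walk drift model, noise levels and data are illustrative assumptions.
\begin{verbatim}
import numpy as np

def kalman_drift_step(drift_est, P, discrepancy, Q=1e-4, R=0.25):
    """One step of a scalar Kalman filter tracking a slowly varying sensor offset.

    discrepancy is the node's reading minus a reference estimate of the true
    value (e.g. an interpolation from neighbouring nodes); Q and R are assumed
    process and observation noise variances."""
    # Prediction: the drift is modelled as a random walk (F = 1, no control input).
    drift_pred = drift_est
    P_pred = P + Q
    # Update with the observed discrepancy (H = 1).
    K = P_pred / (P_pred + R)
    drift_est = drift_pred + K * (discrepancy - drift_pred)
    P = (1.0 - K) * P_pred
    return drift_est, P

rng = np.random.default_rng(0)
true_value, drift_est, P = 20.0, 0.0, 1.0
for t in range(1000):
    true_drift = 0.002 * t                          # sensor slowly drifts upward
    reading = true_value + true_drift + rng.normal(scale=0.5)
    reference = true_value + rng.normal(scale=0.3)  # stand-in for a neighbourhood estimate
    drift_est, P = kalman_drift_step(drift_est, P, reading - reference)

print(f"estimated drift {drift_est:.2f}, true drift {true_drift:.2f}")
\end{verbatim}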
Non-blind, also known as reference-based, calibration approaches rely on known-good reference information. They often rely on data from much more expensive sensors, which often come with restrictions on their use. One type of non-blind calibration is done in a laboratory setting (see \cite{ramanathan2006}), where a known-good sensor is used within a controllable environment. Other approaches can calibrate instantly against a calibrated sensor nearby \cite{hasenfratz2012}, enabling calibration of multiple nodes in quick succession.
Maag et al. \cite{maag2017} propose a hybrid solution, where calibrated sensor arrays can be used to calibrate other non-calibrated arrays in a local network of air pollution sensors over multiple hops with minimal accumulative errors. They show 16-60\% lower error rates than other approaches currently in use.
When we speak of a centralized WSN, we mean that there exists a central entity, called the \emph{base station}, to which all data is delivered. In our analysis it is often assumed that the base station does not have limits on its processing power. The base station collects the received data until it has a complete set and can then use this set to determine global outliers and other anomalies, such as clock drift, over the course of the whole operation, as it has a complete history for each given node. A centralized approach is not optimal in hostile environments, but that is not our focus here. Since this environment is closely related to the general field of anomaly detection, we will not go into much detail on these solutions, instead focusing on covering just the basics.
\subsection{Statistical analysis}
Classical statistical analysis is done by creating a model of the expected data and then finding the probability for each recorded data point. Improbable data points are then deemed outliers. The problem for many statistical approaches is finding this model of the expected data, as it is not always feasible to create it in advance. These approaches also suffer from bad models or slow changes in the environment \cite{mcdonald2013}.
Sheng et al. \cite{sheng2007} propose an approach where histograms of each node are polled, combined, and then analyzed for outliers by looking at the maximum distance a data point can be away from its nearest neighbors. This solution has several problems, as it incurs a considerable communication overhead and fails to account for non-Gaussian distributions. Since this approach uses fixed parameters, it also requires updating them every time the expected data changes.
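As a rough illustration of the underlying distance test (ignoring the histogram compression used to limit communication), the following sketch flags a reading when even its $k$-th nearest neighbor in the combined data lies farther away than a fixed threshold; $k$, the threshold and the synthetic readings are assumptions.
\begin{verbatim}
import numpy as np

def distance_outliers(values, k=5, max_dist=1.0):
    """Flag a reading when even its k-th nearest neighbour in the combined data
    lies farther away than max_dist (k and max_dist are assumed parameters)."""
    values = np.asarray(values, dtype=float)
    dists = np.abs(values[:, None] - values[None, :])  # all pairwise distances
    kth = np.sort(dists, axis=1)[:, k]                 # column 0 is the point itself
    return np.where(kth > max_dist)[0]

rng = np.random.default_rng(0)
combined = np.concatenate([
    rng.normal(20.0, 0.3, 300),   # readings collected from node A
    rng.normal(20.5, 0.3, 300),   # readings collected from node B
    [27.0, 3.5],                  # two injected spikes
])
print(distance_outliers(combined, k=5, max_dist=1.0))  # indices 600 and 601
\end{verbatim}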
Böhm et al. \cite{böhm2008} propose a solution not only to non-Gaussian distributions, but also to noisy data. They define a general probability distribution function (PDF) with an exponential distribution function (EDF) as a basis, which is better suited to fitting around non-Gaussian data, as seen in Figure \ref{fig:probdistböhm}. They then outline an algorithm where the data is split into clusters; for each cluster an EDF is fitted and outliers are discarded.
\label{fig:probdistböhm}
\end{figure}
While there are many statistical methods for outlier detection, most follow an approach similar to at least one of the two methods shown here. Most of them are generally not as useful for online detection, as they require a model of the expected data to be available in advance.
\subsection{Density based analysis}
Outliers can also be selected by looking at the density of points. Breunig et al. \cite{breuning2000} propose a method of calculating a local outlier factor (LOF) for each point based on the local density of its $n$ nearest neighbors. The problem lies in selecting good values for $n$. If $n$ is too small, clusters of outliers might not be detected, while a large $n$ might mark points as outliers even if they are in a large cluster of fewer than $n$ points. This problem is further exacerbated when we try to use this in a WSN setting, for example by streaming through the last $k$ points, as cluster size will not stay constant when incoming data is delayed or lost in transit.
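A minimal sketch of LOF on a window of readings, using the scikit-learn implementation; the neighborhood size plays the role of the parameter $n$ discussed above, and the window size and injected spikes are illustrative assumptions.
\begin{verbatim}
import numpy as np
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.default_rng(0)
window = rng.normal(loc=20.0, scale=0.5, size=(200, 1))  # a window of "normal" readings
window[50] = 35.0                                         # injected spike
window[120] = 4.0                                         # injected spike

lof = LocalOutlierFactor(n_neighbors=20)  # n_neighbors plays the role of n above
labels = lof.fit_predict(window)          # -1 marks an outlier, 1 an inlier
print(np.where(labels == -1)[0])
\end{verbatim}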
Papadimitriou et al. \cite{papadimitriou2003} introduce a parameterless approach. The paper formulates a method using a local correlation integral (LOCI), which does not require parametrization. It uses a multi-granularity deviation factor (MDEF), which is the relative deviation of a point $p$'s local neighborhood density from the average local density in its $r$-neighborhood. LOCI provides an automated way to select good parameters for the MDEF and can detect outliers and outlier-clusters with comparable performance to other statistical approaches. They also formulate aLOCI, a linear approximation of LOCI, which also gives accurate results while reducing runtime.
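The sketch below evaluates the MDEF test directly for one fixed radius, whereas the actual method sweeps over radii automatically; the radius $r$, the ratio $\alpha$ and the $3\sigma$ flagging rule are chosen here purely for illustration.
\begin{verbatim}
import numpy as np

def mdef_outliers(points, r=1.0, alpha=0.5, k_sigma=3.0):
    """Single-radius sketch of the MDEF test: compare each point's own
    alpha*r-neighbourhood count with the average count over its r-neighbourhood.
    The full LOCI method sweeps r automatically; r here is an assumption."""
    points = np.asarray(points, dtype=float)
    d = np.linalg.norm(points[:, None, :] - points[None, :, :], axis=-1)
    n_alpha = (d <= alpha * r).sum(axis=1)          # counting neighbourhood sizes
    flagged = []
    for i in range(len(points)):
        sampling = np.where(d[i] <= r)[0]           # points in the r-neighbourhood of i
        n_hat, sigma = n_alpha[sampling].mean(), n_alpha[sampling].std()
        mdef = 1.0 - n_alpha[i] / n_hat
        if mdef > k_sigma * (sigma / n_hat):
            flagged.append(i)
    return flagged

rng = np.random.default_rng(0)
cluster = rng.normal(loc=[20.0, 45.0], scale=0.2, size=(200, 2))
data = np.vstack([cluster, [[21.0, 45.0]]])         # one sparse reading near the cluster edge
print(mdef_outliers(data))                          # should include index 200
\end{verbatim}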
Most machine learning approaches focus on outlier detection, which is a common problem in WSN, as an outlier is inherently an anomaly. Outlier detection is largely unable to detect drift and has difficulties with noise, but excels at detecting data points or groups which appear to be inconsistent with the other data (spikes). A common problem is finding outliers in data with an inherently complex structure.
It is impossible to create an exhaustive list of classifiers to define what is and isn't an anomaly. Therefore it is difficult to generate labeled training data for machine learning. Furthermore, the data generated by a WSN might change over time without being anomalous, requiring frequent retraining. Out of these circumstances arises the need for unsupervised anomaly detection methods.
We will look into a couple of different approaches to outlier detection:
\subsection{Support vector machines (SVMs)}
Rajasegarar et al. \cite{rajasegarar2010} use SVMs, which leverage a kernel function to map the input space to a higher dimensional feature space. This allows the SVM to model highly nonlinear patterns of normal behavior in a flexible manner. This means that patterns that are difficult to classify in the problem space become more easily recognizable, and therefore classifiable, in the feature space. Once the data is mapped into the feature space, hyperellipsoids are fitted to the data points to define regions of the feature space that classify the data as normal.
While this approach works well to find outliers in the data, it is also computationally expensive and incurs a large communication overhead. In an attempt to decrease computational complexity, only a single hyperellipsoid is fitted to the dataset. This method is called a one-class support vector machine (OCSVM). Originally, Wang et al. \cite{wang2006} created a model of a one-class SVM, however their solution required solving a computationally complex second-order cone programming problem, making it unusable for distributed usage. Rajasegarar et al. \cite{rajasegarar2007, rajasegarar2010} improved on this OCSVM in a couple of ways.
They used the fact that they could normalize numerical input data to lie in the vicinity of the origin inside the feature space, and furthermore the results of Laskov et al. \cite{laskov2004}, which showed that normalized numerical data is one-sided, always lying in the positive quadrants. This led to the formulation of a centered-hyperellipsoidal SVM (CESVM) model, which vastly reduces computational complexity to a linear problem. Furthermore, they introduce a one-class quarter-sphere SVM (QSSVM), which reduces the communication overhead. They conclude, however, that the technique is still unfit for decentralized use because of the large remaining communication overhead, as a consensus for the radii and other parameters is still required.
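For reference, a minimal centralized one-class SVM sketch using the generic scikit-learn implementation (not the CESVM or QSSVM variants discussed here); the RBF kernel, the value of $\nu$ and the synthetic temperature/humidity readings are assumptions.
\begin{verbatim}
import numpy as np
from sklearn.svm import OneClassSVM

rng = np.random.default_rng(0)
# training window: temperature / humidity pairs considered normal
train = rng.normal(loc=[20.0, 45.0], scale=[0.5, 2.0], size=(500, 2))

ocsvm = OneClassSVM(kernel="rbf", nu=0.05, gamma="scale")  # nu bounds the outlier fraction
ocsvm.fit(train)

new = np.array([[20.3, 44.0],    # plausible reading
                [31.0, 10.0]])   # spike in both attributes
print(ocsvm.predict(new))        # 1 = normal, -1 = anomalous
\end{verbatim}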
The QSSVM was improved in 2012 by Shahid et al. \cite{shahid2012a, shahid2012b}, who propose three schemes that reduce communication overhead while maintaining detection performance. Their propositions make use of the spatio-temporal \& attribute (STA) correlations in the measured data. These propositions accept a worse consensus about the placement of the hypersphere among neighboring nodes in order to reduce the communication overhead. They then show that their approaches are comparable in performance to the QSSVM proposed by Rajasegarar et al., if the data correlates well enough inside each neighborhood. It is important to note that this neighborhood information does not rely on nodes being stationary and is therefore usable in a shifting network topology.
As far as we are aware, there are no SVM solutions that solve the problem of a dynamic network topology.
\subsection{Generalized Hebbian Algorithm}
Ali et al. \cite{ali2015} propose an approach to detect and identify events using the Generalized Hebbian Algorithm (GHA). Event detection is important in anomaly detection, but event identification is almost equally important, especially when a sensor network is used to detect an event spanning multiple nodes. They propose a combined algorithm to detect, identify and communicate events in a WSN, covering both local and global events. This is achieved by calculating identification ratios, i.e. the percentage each attribute contributed to the event, before broadcasting the detected event.
They start off with an outlier detection scheme using hyper-ellipsoids fitted around 98\% of their data points to detect outliers, using an iterative boundary estimation model based on the model formulated by Moshtaghi et al. \cite{moshtaghi2011} called Forgetting Factor Iterative Data Capture Anomaly Detection (FFIDCAD). It can compute multidimensional boundaries of the local model online in an iterative fashion, reducing the amount of required computation immensely, while also working in non-stationary environments and changing network topologies thanks to its forgetting factor. A local event is declared after observing more than $q$ outliers in a row, where $q$ is chosen depending on sampling rate and required temporal resolution.
Once an event is detected, Ali et al. propose using the GHA to replace the eigenvalue decomposition (EVD) commonly used in offline identification schemes. EVD requires large batches of measurements to accurately compute principal components, while the GHA can work online in a streaming fashion. They further show that their online GHA-based approach has similar accuracy to offline EVD-based techniques, while vastly reducing computational complexity. Once the eigenvectors are calculated, the last measurement is projected onto them and whitened, creating a vector containing the identification ratios for each attribute.
Ali et al. claim that their algorithm has a complexity of $\mathcal{O}(nd^2)$, compared to $\mathcal{O}(n^2+nd^2)$ for common SVM based approaches \cite{shahid2012a,shahid2012b}, where $n$ is the number of measurements and $d$ is the number of attributes. Furthermore, due to the online nature of this approach, communication overhead is much lower, as only detected local events have to be broadcast, instead of the ongoing exchange of support vectors required by the aforementioned SVM approaches.
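To illustrate the streaming principal component estimation at the core of this approach, the sketch below applies the generic GHA (Sanger's rule) update one measurement at a time; the learning rate, dimensions and synthetic data are assumptions, and this is not Ali et al.'s full event identification pipeline.
\begin{verbatim}
import numpy as np

def gha_update(W, x, lr=0.005):
    """One streaming GHA (Sanger's rule) step. Each row of W is the current
    estimate of one principal component; x is a single zero-mean measurement."""
    y = W @ x
    W += lr * (np.outer(y, x) - np.tril(np.outer(y, y)) @ W)
    return W

rng = np.random.default_rng(0)
d, m = 4, 2                                      # 4 sensor attributes, 2 components
mixing = rng.normal(size=(d, d))
data = rng.normal(size=(5000, d)) @ mixing       # correlated attribute vectors
data -= data.mean(axis=0)

W = rng.normal(scale=0.1, size=(m, d))
for x in data:                                   # one measurement at a time, no batching
    W = gha_update(W, x)

# rows of W now approximate the two leading eigenvectors of the data covariance
print(W / np.linalg.norm(W, axis=1, keepdims=True))
\end{verbatim}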
\subsection{Principal Component Analysis}
\cite{oreilly2016}
\subsection{Extreme learning}
\section{Decentralized approaches using passive neighborhood data}
When working decentralized with no additional overhead, it is still possible to obtain additional data just by listening to other nodes' broadcasts. This data can be fed into various prediction models, which can then be used to calculate a confidence level for the node's own measurements.
Bosman et al. \cite{bosman2017} look at the performance of recursive least squares (RLS) and the online sequential extreme learning machine (OS-ELM) approach to train a single-layer feed-forward neural network (SLFN). These are compared to first degree polynomial function approximation (FA) and sliding window mean prediction. The article shows that incorporating neighborhood information improves anomaly detection only in cases where the data set is well-correlated and shows low spatial entropy, as is common in most natural monitoring applications. When the data set does not correlate well, or there is too much spatial entropy, the methods described in this paper fail to predict anomalies. It concludes that neighborhood aggregation is not useful beyond 5 neighbors, as such a large data set will fail to meet the aforementioned conditions. The exact size of the optimal neighborhood will vary with network topology and sensor modality.
Here, all four types of anomalies were accounted for in the data set, but there was no analysis of how good the detection was for each kind of anomaly.
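As an illustration of the simplest of these predictors, the sketch below uses recursive least squares to predict a node's own reading from its neighbors' broadcasts and flags large prediction errors; the forgetting factor, threshold and synthetic data are assumptions, not values from the cited study.
\begin{verbatim}
import numpy as np

class RLSPredictor:
    """Recursive least squares predictor: estimate a node's own reading from its
    neighbours' broadcast readings; a large prediction error hints at an anomaly."""
    def __init__(self, n_neighbors, lam=0.98, delta=100.0):
        self.theta = np.zeros(n_neighbors + 1)    # regression weights incl. bias
        self.P = np.eye(n_neighbors + 1) * delta  # inverse correlation estimate
        self.lam = lam                            # forgetting factor

    def step(self, neighbor_readings, own_reading):
        phi = np.append(neighbor_readings, 1.0)   # regressor with bias term
        error = own_reading - self.theta @ phi
        K = self.P @ phi / (self.lam + phi @ self.P @ phi)
        self.theta += K * error
        self.P = (self.P - np.outer(K, phi @ self.P)) / self.lam
        return error

rls = RLSPredictor(n_neighbors=3)
rng = np.random.default_rng(0)
for t in range(500):
    neigh = 20.0 + 0.01 * t + rng.normal(scale=0.2, size=3)  # slowly rising field
    own = neigh.mean() + rng.normal(scale=0.2)
    if t == 400:
        own += 5.0                                            # inject a spike
    error = rls.step(neigh, own)
    if t > 50 and abs(error) > 1.0:                           # illustrative threshold
        print(f"t={t}: possible anomaly, prediction error {error:.2f}")
\end{verbatim}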
\subsection{Deep learning}
Supervised learning is the process of training a neural network on a set of labeled data. Acquiring labeled data sets that are applicable to the given situation is often difficult, as it requires the existence of another classification method, or labeling by hand. Furthermore, even if a data set existed, the class imbalance (total number of positive labels vs. number of negative labels) would render such training data sub-optimal. These restrictions prove prohibitive when compared to semi-supervised or unsupervised learning approaches, and supervised methods won't be covered in this survey.
\subsubsection{Semi-Supervised deep anomaly detection}
\section{Non-stationary data}\cite{oreilly2014}
\section{Conclusion}
Anomaly detection in WSN is a relatively new addition to the general field of anomaly detection, but has already become a rather complex landscape of solutions, as many experts in their respective fields have used their knowledge to find solutions to these new problems. This survey attempts to capture this diversity of methods and introduces many fundamentally different approaches. In order to organize these approaches, we first defined the four anomaly types that are expected in WSNs, and then looked at methods that detect or remove them.
First, we looked at solutions for sensor drift and offset, and found that while sensor calibration is an important step in preventing these, calibration in the field is often not feasible. We then looked at some other ways to compensate for sensor drift in data sets.