diff --git a/References.bib b/References.bib index 26e5b00..7e3afcf 100644 --- a/References.bib +++ b/References.bib @@ -34,7 +34,18 @@ volume = {}, number = {}, year = {2008} +} +@article{yu2017, + title={Recursive principal component analysis-based data outlier detection and sensor data aggregation in IoT systems}, + author={Yu, Tianqi and Wang, Xianbin and Shami, Abdallah}, + journal={IEEE Internet of Things Journal}, + volume={4}, + number={6}, + pages={2207--2216}, + year={2017}, + publisher={IEEE} } + @article{wang2006, author = {Defeng Wang and Daniel S. Yeung and Eric C.C. Tsang}, journal = {IEEE transactions on systems, man, and cybernetics - Part B: cybernetics}, @@ -152,6 +163,103 @@ year={2011}, organization={IEEE} } +@article{ramotsoela2018, + title={A survey of anomaly detection in industrial wireless sensor networks with critical water system infrastructure as a case study}, + author={Ramotsoela, Daniel and Abu-Mahfouz, Adnan and Hancke, Gerhard}, + journal={Sensors}, + volume={18}, + number={8}, + pages={2491}, + year={2018}, + publisher={Multidisciplinary Digital Publishing Institute} +} +@inproceedings{bosman2013, + title={Online extreme learning on fixed-point sensor networks}, + author={Bosman, Hedde HWJ and Liotta, Antonio and Iacca, Giovanni and W{\"o}rtche, Heinrich J}, + booktitle={2013 IEEE 13th International Conference on Data Mining Workshops}, + pages={319--326}, + year={2013}, + organization={IEEE} +} +@inproceedings{macua2010, + title={Consensus-based distributed principal component analysis in wireless sensor networks}, + author={Macua, Sergio Valcarcel and Belanovic, Pavle and Zazo, Santiago}, + booktitle={2010 IEEE 11th International Workshop on Signal Processing Advances in Wireless Communications (SPAWC)}, + pages={1--5}, + year={2010}, + organization={IEEE} +} +@article{chan2012, + title={Robust recursive eigendecomposition and subspace-based algorithms with application to fault detection in wireless sensor networks}, + 
author={Chan, Shing-Chow and Wu, HC and Tsui, Kai Man}, + journal={IEEE Transactions on Instrumentation and Measurement}, + volume={61}, + number={6}, + pages={1703--1718}, + year={2012}, + publisher={IEEE} +} +@article{li2000, + title={Recursive PCA for adaptive process monitoring}, + author={Li, Weihua and Yue, H Henry and Valle-Cervantes, Sergio and Qin, S Joe}, + journal={Journal of process control}, + volume={10}, + number={5}, + pages={471--486}, + year={2000}, + publisher={Elsevier} +} +@inproceedings{tien2004, + title={Comparative study of PCA approaches in process monitoring and fault detection}, + author={Tien, Doan X and Lim, K-W and Jun, Liu}, + booktitle={30th Annual Conference of IEEE Industrial Electronics Society, 2004. IECON 2004}, + volume={3}, + pages={2594--2599}, + year={2004}, + organization={IEEE} +} +@book{jolliffee2002, + author = {I.T. Jolliffe}, + title = {Principal Component Analysis}, + publisher = {Springer}, + year = {2002} +} +@article{chalapathy2019, + title={Deep learning for anomaly detection: A survey}, + author={Chalapathy, Raghavendra and Chawla, Sanjay}, + journal={arXiv preprint arXiv:1901.03407}, + year={2019} +} +@article{kiran2018, + title={An overview of deep learning based methods for unsupervised and semi-supervised anomaly detection in videos}, + author={Kiran, B Ravi and Thomas, Dilip Mathew and Parakkal, Ranjith}, + journal={Journal of Imaging}, + volume={4}, + number={2}, + pages={36}, + year={2018}, + publisher={Multidisciplinary Digital Publishing Institute} +} +@article{zhang2018, + title={LSTM-based analysis of industrial IoT equipment}, + author={Zhang, Weishan and Guo, Wuwu and Liu, Xin and Liu, Yan and Zhou, Jiehan and Li, Bo and Lu, Qinghua and Yang, Su}, + journal={IEEE Access}, + volume={6}, + pages={23551--23560}, + year={2018}, + publisher={IEEE} +} + + +@article{mohanty2020, + title={Deep learning with LSTM based distributed data mining model for energy efficient wireless sensor networks}, + 
author={Mohanty, Sachi Nandan and Lydia, E Laxmi and Elhoseny, Mohamed and Al Otaibi, Majid M Gethami and Shankar, K}, + journal={Physical Communication}, + volume={40}, + pages={101097}, + year={2020}, + publisher={Elsevier} +} % drift diff --git a/img/anomaly_types.png b/img/anomaly_types.png new file mode 100644 index 0000000..2985380 Binary files /dev/null and b/img/anomaly_types.png differ diff --git a/img/image--001.jpg b/img/image--001.jpg new file mode 100644 index 0000000..b095167 Binary files /dev/null and b/img/image--001.jpg differ diff --git a/img/image--002.jpg b/img/image--002.jpg new file mode 100644 index 0000000..1e2bd56 Binary files /dev/null and b/img/image--002.jpg differ diff --git a/img/lstm_pump_predictions.png b/img/lstm_pump_predictions.png new file mode 100644 index 0000000..2fe2864 Binary files /dev/null and b/img/lstm_pump_predictions.png differ diff --git a/paper.tex b/paper.tex index 406cc77..bc5ccf4 100644 --- a/paper.tex +++ b/paper.tex @@ -8,6 +8,11 @@ % Adjust this to the language used. \usepackage[british]{babel} +% use nice mathematical symbols +\usepackage{amsfonts} +\newcommand{\R}{\mathbb{R}} +\newcommand{\Oc}{\mathcal{O}} + \begin{document} @@ -20,7 +25,6 @@ \semester{Sommersemester 2020} - \author{Anton Lydike} \affiliation{\institution{Universität Augsburg}} @@ -41,16 +45,22 @@ There are many different approaches to anomaly detection, we will differentiate between centralized and decentralized approaches. An approach is considered centralized, when a large chunk of the computation is done at a single point, or at a later stage during analysis. A decentralized approach implies that a considerable amount of processing is done on the individual nodes, doing analysis on the fly. When analysis is done centralized, it is important to differentiate between online and offline detection. Online detection can run while the WSN is operating, while offline detection is done after the data is collected. 
Online detection often reduces mission duration due to increased power consumption, but can have the opposite effect, if it can be used to eliminate a large amount of communication. \subsection{Anomaly types} -Furthermore we need to clarify the different kinds of anomalies that can occur in WSN data sets. Bosman et al. \cite{bosman2017} proposes four different kinds of anomalies that occur in WSN: +Furthermore we need to clarify the different kinds of anomalies that can occur in WSN data sets. Bosman et al. \cite{bosman2017} propose four different kinds of anomalies that occur in WSN (see also figure \ref{fig:noisetypes}): \begin{itemize} - \item \emph{Spikes or outliers} are short changes with a large amplitude - \item \emph{Noise} is (an increase of) variance over time + \item \emph{Spikes} are short changes with a large amplitude + \item \emph{Noise} is (an increase of) variance over a given time + \item \emph{Constant} is the sudden absence of noise \item \emph{Drift} is an offset which increases over time - \item \emph{Constant} is a constant offset \end{itemize} -No method can account for all four types of anomalies at once. Therefore we will look into sensor self-calibration, which removes drift and constant anomalies, followed by outlier detection to detect spikes. Working with noisy data is a problem in WSN, but we will not focus on methods of cleaning noisy data, as it is not in the scope of this survey. Elnahrawy et al. \cite{elnahrawy2003} and Barcelo et al. \cite{barcelo2019} are a great places to start, if you are interested in this topic. +\begin{figure} + \includegraphics[width=8.5cm]{img/anomaly_types.png} + \caption{Spike, noise, constant and drift type anomalies in noisy linear data, image from Bosman et al. 
\cite{bosman2013}} + \label{fig:noisetypes} +\end{figure} + +We will look into sensor self-calibration, which often removes or reduces drift and constant offsets, followed by outlier detection to detect spikes, noise and drift type anomalies. A noise anomaly is not the same as a noisy sensor. Working with noisy data is a problem in WSN, but we will not focus on methods of cleaning noisy data, as it is not in the scope of this survey. Elnahrawy et al. \cite{elnahrawy2003} and Barcelo et al. \cite{barcelo2019} are great places to start if you are interested in this topic. A fifth anomaly type, \emph{sensor failure}, is commonly added to anomaly detection \cite{rajasegarar2008,chandola2009}. Since sensor failure often manifests in these four different ways mentioned above, and we are not interested in sensor fault prediction, detection and management here, faulty sensors will not be discussed further. @@ -87,10 +97,11 @@ Non-blind, also known as reference-based calibration approached rely on known-go Maag et al. \cite{maag2017} proposes a hybrid solution, where calibrated sensor arrays can be used to calibrate other non-calibrated arrays in a local network of air pollution sensors over multiple hops with minimal accumulative errors. They show 16-60\% lower error rates than other approaches currently in use. +\subsection{Other methods for drift-correction} -\section{Outlier detection - Centralized model-based approaches} -When we speak of a centralized WSN, we mean, that there exists a central entity, called the \emph{base station}, where all data is delivered to. In our analysis, it is often assumed, that the base station does not have limits on its processing power. The base station will summarize the received data until it has a complete set and can then use this set to determine global outliers and other anomalies such as clock drift over the course of the whole operation, as it has a complete history for each given node. 
A centralized approach is not optimal in hostile environments, but that is not our focus here. Since this environment is closely related to the general field of anomaly detection, we will not go into much detail on these solution, instead focusing on covering just the basics. +\section{Outlier detection - model-based approaches} +When we speak of a centralized WSN, we mean that there exists a central entity, called the \emph{base station} or \emph{fusion centre}, to which all data is delivered and where it is analyzed. It is often assumed that the base station does not have limits on its processing power or storage. Centralized approaches are not optimal in hostile environments, but that is not our focus here. Since central anomaly detection is closely related to the general field of anomaly detection, we will not go into much detail on these solutions, instead focusing on covering solutions more specific to the field of WSN. \subsection{Statistical analysis} Classical Statistical analysis is done by creating a model of the expected data and then finding the probability for each recorded data point. Improbable data points are then deemed outliers. The problem for many statistical approaches is finding this model of the expected data, as it's not always feasible to create it in advance. It also bears the problem of bad models or slow changes in the environment \cite{mcdonald2013}. @@ -105,25 +116,38 @@ Böhm et al. \cite{böhm2008} proposes a solution not only to non gaussian distr \label{fig:probdistböhm} \end{figure} -While there are many statistical methods for outlier detection, most follow a similar approach to at least one of the two methods shown here. Most of these are generally not as useful for online detection, as they require - \subsection{Density based analysis} Outliers can be selected by looking at the density of points as well. Breuning et al. 
\cite{breuning2000} proposes a method of calculating a local outlier factor (LOF) of each point based on the local density of its $n$ nearest neighbors. The problem lies in selecting good values for $n$. If $n$ is too small, clusters of outliers might not be detected, while a large $n$ might mark points as outliers, even if they are in a large cluster of $