参考博客:参考
算法简介: 该算法比较简单,需要人为设置两个阈值:一个是距离阈值 distance,一个是分数阈值 score(0 < score < 1)。算法伪代码如下:对每个样本,统计与它的欧氏距离小于 distance 的其他样本的个数;如果该个数小于 score × 样本总数,则判定该样本为离群点,否则为正常点。
C++实现(支持多维数据,会对数据进行归一化操作,归一化参考):
#include <iostream>
#include <vector>
#include <cmath>
/// @brief Distance-based outlier detection over a row-major sample matrix.
///
/// Each column is min-max normalized first. A row is kept as an inlier unless
/// the number of *other* rows whose Euclidean distance to it (measured on the
/// normalized data) is below `distance` is smaller than `score * src_rows`.
/// Rows that fail this test are outliers and are dropped from the result.
///
/// @param src       Row-major buffer holding `src_rows * src_cols` doubles.
///                  The buffer is only read; all work happens on a copy.
/// @param src_rows  Number of samples (rows).
/// @param src_cols  Number of dimensions per sample (columns).
/// @param distance  Neighbourhood radius in normalized space.
/// @param score     Fraction of `src_rows` that must be neighbours for a row
///                  to count as an inlier (intended range: 0 < score < 1).
/// @return The normalized values of every inlier row, concatenated row by
///         row. Empty when `src` is null or either dimension is not positive.
std::vector<double> OutlierDetect(double* src, int src_rows, int src_cols,
                                  double distance = 0.01, double score = 0.5) {
    using Index = std::vector<double>::size_type;

    if (src == nullptr || src_rows <= 0 || src_cols <= 0) {
        return {};
    }
    const Index rows = static_cast<Index>(src_rows);
    const Index cols = static_cast<Index>(src_cols);

    // Copy the input so normalization never touches the caller's buffer.
    std::vector<double> data(src, src + rows * cols);

    // Seed the per-column extrema from the first row instead of magic
    // constants, so columns with values outside any fixed range are handled.
    std::vector<double> col_min(src, src + cols);
    std::vector<double> col_max(col_min);
    for (Index r = 1; r < rows; ++r) {
        const Index base = r * cols;
        for (Index c = 0; c < cols; ++c) {
            const double value = data[base + c];
            if (value > col_max[c]) {
                col_max[c] = value;
            }
            if (value < col_min[c]) {
                col_min[c] = value;
            }
        }
    }

    // Min-max normalization. A constant column has zero range; map it to 0
    // rather than dividing 0 by 0, which would turn every distance into NaN.
    for (Index r = 0; r < rows; ++r) {
        const Index base = r * cols;
        for (Index c = 0; c < cols; ++c) {
            const double range = col_max[c] - col_min[c];
            data[base + c] =
                range > 0.0 ? (data[base + c] - col_min[c]) / range : 0.0;
        }
    }

    const double min_neighbours = score * src_rows;
    std::vector<double> inliers;
    for (Index i = 0; i < rows; ++i) {
        // Computed up front so it is valid even when there is only one row.
        const Index base_i = i * cols;

        int neighbours = 0;
        for (Index j = 0; j < rows; ++j) {
            if (j == i) {
                continue;  // a row is not its own neighbour
            }
            const Index base_j = j * cols;
            double squared = 0.0;
            for (Index k = 0; k < cols; ++k) {
                const double diff = data[base_i + k] - data[base_j + k];
                squared += diff * diff;
            }
            if (std::sqrt(squared) < distance) {  // Euclidean distance
                ++neighbours;
            }
        }

        if (neighbours < min_neighbours) {
            continue;  // too few close neighbours: outlier, not returned
        }
        for (Index k = 0; k < cols; ++k) {
            inliers.push_back(data[base_i + k]);
        }
    }
    return inliers;
}
代码未经验证,仅供参考。