LOF(Local Outlier Factor,局部离群因子)算法是一种基于密度的无监督离群点检测算法,其核心思想是:比较对象xi的局部密度与其近邻的局部密度,两者差异越大,即xi的LOF值越是大于1,xi是离群点的可能性就越高。
下面给出了LOF算法的Matlab版本实现,可直接复制粘贴,对代码稍作修改(例如替换数据集和标签文件名、调整近邻个数k和异常对象个数)即可运行。
function [outputArg1,outputArg2] = LOF(inputArg1,inputArg2)
%LOF  Local Outlier Factor detection with evaluation against ground truth.
%
%   Loads the data set from 'Normalization_wbc.txt' (one object per row)
%   and the labels from 'Label_wbc.txt' (1 = abnormal, 0 = normal),
%   computes an LOF score for every object using k nearest neighbours,
%   flags the Abnormal_number highest-scoring objects as outliers, then
%   prints the AUC, accuracy, detection rate, precision and false-alarm
%   rate and draws the confusion matrix as a heatmap.
%
%   inputArg1, inputArg2  unused; kept so existing callers keep working.
%   outputArg1            n-by-1 vector of LOF scores (only if requested).
%   outputArg2            row indices of the objects flagged as outliers,
%                         ordered by ascending LOF score (only if requested).
%
%   Requires Measure_AUC on the path.

x = load('Normalization_wbc.txt');   % data set to analyse
Label = load('Label_wbc.txt');       % ground-truth labels for x
ADLabels = Label;                    % same labels, used for the AUC

% Number of samples; also used when splitting flagged / unflagged objects.
n = size(x,1);
% Number of nearest neighbours.
k = 20;
% Number of objects to report as abnormal.
Abnormal_number = 20;

if n < k+1
    error('LOF:tooFewSamples', ...
        'Need at least k+1 = %d samples, but the data set has %d.', k+1, n);
end
if Abnormal_number > n
    error('LOF:tooManyOutliers', ...
        'Abnormal_number (%d) exceeds the number of samples (%d).', ...
        Abnormal_number, n);
end
if numel(Label) ~= n
    error('LOF:labelMismatch', ...
        'Expected %d labels, but the label file has %d.', n, numel(Label));
end

% Pairwise Euclidean distances via ||a||^2 + ||b||^2 - 2*a'*b.
x2 = sum(x.^2,2);                    % squared norm of every row
D2 = repmat(x2,1,n) + repmat(x2',n,1) - 2*x*x';
% Round-off can make tiny squared distances negative; clamp so that sqrt
% stays real (sort would otherwise order complex values by magnitude).
D2 = max(D2,0);
% s(p,:) holds the sorted distances from object p, t(p,:) the matching
% object indices; column 1 is normally the object itself (distance 0).
[s,t] = sort(sqrt(D2),2);

% Local reachability density of every object (column 1 of LRD) and of each
% of its k nearest neighbours (columns 2..k+1).
% NOTE(review): the k-distance is read from column k of s, which counts the
% object itself as its first neighbour; some LOF formulations would use
% column k+1 - confirm which convention is intended.
RD = zeros(n,k);
LRD = zeros(n,k+1);
for i = 1:k+1
    for j = 1:k
        RD(:,j) = max(s(t(t(:,i),j+1),k), s(t(:,i),j+1));
    end
    LRD(:,i) = 1./mean(RD,2);
end
% LOF score: mean density of the neighbours divided by the object's own.
lofScores = mean(LRD(:,2:k+1),2)./LRD(:,1);

% Ascending sort: the objects with the largest scores are at the end.
[value_outlier,index_outlier] = sort(lofScores,'ascend'); %#ok<ASGLU>

auc = Measure_AUC(lofScores, ADLabels);
disp(auc)

% Objects the detector regards as abnormal (highest scores) and normal.
ODA_AbnormalObject_Number = index_outlier(n-Abnormal_number+1:end,:);
ODA_NormalObject_Number = index_outlier(1:n-Abnormal_number,:);

% ---------------------------------------------------------------------
% Evaluation: detection rate / accuracy / false-alarm rate etc.
% Positive class = abnormal objects, negative class = normal objects.
% ---------------------------------------------------------------------
Real_NormalObject_Number = find(Label(:)==0);
Real_AbnormalObject_Number = find(Label(:)==1);

TP = length(intersect(Real_AbnormalObject_Number,ODA_AbnormalObject_Number));
% Real anomalies that were not flagged are false negatives.
FN = length(Real_AbnormalObject_Number)-TP;
TN = length(intersect(Real_NormalObject_Number,ODA_NormalObject_Number));
% Real normal objects that were flagged are false positives.
FP = length(Real_NormalObject_Number)-TN;

% Accuracy
ACC = (TP+TN)/(TP+TN+FP+FN);
fprintf('准确率ACC= %8.5f\n',ACC*100)
% Detection rate == recall
DR = TP/(TP+FN);
fprintf('检测率DR= %8.5f\n',DR*100)
% Precision
P = TP/(TP+FP);
fprintf('查准率P= %8.5f\n',P*100)
% False-alarm rate
FAR = FP/(TN+FP);
fprintf('误报率FAR= %8.5f\n',FAR*100)

% Confusion matrix: rows = actual (abnormal, normal),
% columns = predicted (abnormal, normal).
Confusion_matrix = [TP,FN;FP,TN];
Figure_Confusion_matrix = heatmap(Confusion_matrix); %#ok<NASGU>

% Fill the outputs only on request so that a plain call without outputs
% behaves exactly as before (nothing is echoed as "ans").
if nargout > 0
    outputArg1 = lofScores;
    outputArg2 = ODA_AbnormalObject_Number;
end
end
请配合下面计算AUC值的函数Measure_AUC一起使用(如果不需要计算AUC,将LOF代码中调用Measure_AUC和显示auc的两行语句注释掉即可运行)。
function AccumAuc = Measure_AUC(Scores, Labels)
%MEASURE_AUC  Area under the ROC curve for anomaly scores.
%
%   Scores: predicted scores, one per instance (higher = more anomalous).
%   Labels: ground-truth labels, PosLabel = 1 & NegLabel = 0; same length
%           and order as Scores.
%
%   Returns the AUC accumulated while sweeping the instances in order of
%   descending score. A run of equal scores is treated as one step of the
%   curve (trapezoid), so the result does not depend on the order in which
%   tied instances appear. If either class is absent the result is NaN.

NumInst = length(Scores);

% Sort scores in descending order and reorder the labels accordingly.
[Scores, index] = sort(Scores, 'descend');
Labels = Labels(index);

PosLabel = 1;
NegLabel = 0;
NumPos = nnz(Labels == PosLabel);
NumNeg = nnz(Labels == NegLabel);

AccumPos = 0;   % positives seen so far
AccumNeg = 0;   % negatives seen in the current step
AccumAuc = 0;
UnitPos = 1 / NumPos;
UnitNeg = 1 / NumNeg;

i = 1;
while i <= NumInst
    temp = AccumPos;   % positives seen before the current step
    % i < NumInst (not NumInst - 1): a tie between the last two sorted
    % scores must be handled as a tie as well.
    if (i < NumInst) && (Scores(i) == Scores(i + 1))
        % Consume every member of the tie group except the last one.
        while (i < NumInst) && (Scores(i) == Scores(i + 1))
            if Labels(i) == NegLabel
                AccumNeg = AccumNeg + 1;
            elseif Labels(i) == PosLabel
                AccumPos = AccumPos + 1;
            else
                disp('Label is not defined!');
            end
            i = i + 1;
        end
        % Last member of the tie group.
        if Labels(i) == NegLabel
            AccumNeg = AccumNeg + 1;
        elseif Labels(i) == PosLabel
            AccumPos = AccumPos + 1;
        else
            disp('Label is not defined!');
        end
        % Trapezoid: width = negatives in the group, heights = positive
        % rate before and after the group.
        AccumAuc = AccumAuc + (AccumPos + temp) * UnitPos * AccumNeg * UnitNeg / 2;
        AccumNeg = 0;
    else
        if Labels(i) == NegLabel
            % Rectangle: one negative wide, current positive rate high.
            AccumNeg = AccumNeg + 1;
            AccumAuc = AccumAuc + AccumPos * UnitPos * AccumNeg * UnitNeg;
            AccumNeg = 0;
        elseif Labels(i) == PosLabel
            AccumPos = AccumPos + 1;
        else
            disp('Label is not defined');
        end
    end
    i = i + 1;
end