局部异常因子算法-Local Outlier Factor(LOF)

最新推荐文章于 2023-02-08 17:36:05 发布

qq_43104025

最新推荐文章于 2023-02-08 17:36:05 发布

阅读量2.3k

点赞数 1

工程应用领域：
欺诈检测：识别信用卡、社会保障等方面的欺诈行为，以及银行卡、电话卡的盗用等
入侵检测：如计算机网络的非法访问等
活动监控：通过实时监测手机活跃度或股票市场中的可疑交易，发现手机诈骗等异常行为
网络性能：计算机网络性能检测（稳健性分析），检测网络堵塞情况等
自然生态应用领域：生态系统失调、异常自然气候的发现等
公共服务领域：公共卫生中的异常疾病的爆发、公共安全中的突发事件的发生等

"""
Created on Mon Sep  3 10:44:10 2018

@author: lenovo
"""
def localoutlierfactor(data, predict, k):
    """Score the rows of ``predict`` with a LOF model fitted on ``data``.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Samples used to fit the Local Outlier Factor model.
    predict : pandas.DataFrame
        Samples to score. On entry it must hold only the feature columns;
        the two result columns are added to this object in place.
    k : int
        Number of neighbours used by the model.

    Returns
    -------
    pandas.DataFrame
        The same ``predict`` object with two extra columns:
        ``'k distances'`` (distance to the farthest of the k nearest fitted
        samples) and ``'local outlier factor'`` (the LOF value; larger means
        more outlying).
    """
    from sklearn.neighbors import LocalOutlierFactor

    # novelty=True enables the public score_samples() for arbitrary query
    # points. The private _decision_function used previously is not a stable
    # API (later scikit-learn releases changed and then dropped it).
    clf = LocalOutlierFactor(n_neighbors=k, algorithm='auto',
                             contamination=0.05, n_jobs=-1, novelty=True)
    clf.fit(data)

    # Compute both results before touching `predict`, so the model only ever
    # sees the original feature columns.
    neighbor_distances, _ = clf.kneighbors(predict)
    # score_samples() returns the opposite of the LOF, hence the negation.
    lof_scores = -clf.score_samples(predict)

    predict['k distances'] = neighbor_distances.max(axis=1)
    predict['local outlier factor'] = lof_scores
    return predict

def plot_lof(result, method):
    """Scatter-plot scored samples, colouring outliers red and inliers blue.

    Parameters
    ----------
    result : pandas.DataFrame
        Frame with feature columns labelled ``0`` and ``1`` (used as the x and
        y coordinates) and a ``'local outlier factor'`` column.
    method : float
        Threshold: rows whose LOF is strictly greater are drawn as outliers,
        the rest as normal points.
    """
    import matplotlib.pyplot as plt

    # SimHei lets matplotlib render the Chinese labels; the second setting
    # keeps the minus sign displayable with that font.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(8, 4)).add_subplot(111)

    scores = result['local outlier factor']
    groups = (
        (scores > method, 'red', '离群点'),
        (scores <= method, 'blue', '正常点'),
    )
    for selector, colour, legend_text in groups:
        rows = result.loc[result[selector].index]
        plt.scatter(rows[0], rows[1], c=colour, s=50,
                    marker='.', alpha=None, label=legend_text)

    plt.xlim(0, 1)
    plt.title('LOF局部离群点检测', fontsize=13)
    plt.legend()
    plt.show()



def lof(data, predict=None, k=12, method=1, plot=True):
    """Run LOF outlier detection and split the samples by a threshold.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Samples used to fit the model.
    predict : array-like or pandas.DataFrame, optional
        Samples to score. When omitted, a copy of ``data`` is scored.
    k : int, default 12
        Number of neighbours passed to ``localoutlierfactor``.
    method : float, default 1
        LOF threshold; rows with a strictly greater LOF are outliers.
    plot : bool, default True
        When truthy, draw the result with ``plot_lof``.

    Returns
    -------
    tuple
        ``(outliers, inliers, predict)`` where ``predict`` is the scored
        frame and the other two are its rows above / at-or-below the
        threshold, each sorted by ascending LOF.
    """
    import pandas as pd

    # `is None` instead of `== None`: the equality test is element-wise for
    # DataFrames/arrays and previously only worked because the resulting
    # ValueError was silently swallowed.
    if predict is None:
        # Copy so the scoring columns never end up on the caller's data.
        predict = pd.DataFrame(data).copy()
    else:
        predict = pd.DataFrame(predict)

    # Compute the LOF score for every row.
    predict = localoutlierfactor(data, predict, k)
    if plot:
        plot_lof(predict, method)

    # Split into outliers and normal points by the threshold.
    scores = predict['local outlier factor']
    outliers = predict[scores > method].sort_values(by='local outlier factor')
    inliers = predict[scores <= method].sort_values(by='local outlier factor')
    return outliers, inliers, predict

import numpy as np
import pandas as pd
# NOTE: absolute path to the input CSV on the original author's machine.
shuju = pd.read_csv('C:\\Users\\lenovo\\Desktop\\AA.csv', sep=",")
lon = np.array(shuju["single"][:])  # x1
lat = np.array(shuju["mulit"][:])  # x2
# Pair the two features per row; note the order is (mulit, single), i.e. (x2, x1).
A = list(zip(lat, lon))

# Collect the first and second CSV columns row by row. `.iloc` makes the
# positional access explicit; plain `row[0]` on a label-indexed Series is
# deprecated in recent pandas.
a = []
b = []
for _, row in shuju.iterrows():
    a.append(row.iloc[0])
    b.append(row.iloc[1])

# Run LOF with k=12 neighbours; points with LOF > 1.3 are reported as outliers.
outliers1, inliers1, predict = lof(A, k=12, method=1.3)
print(outliers1, inliers1, predict)

lof_value = predict["local outlier factor"]
lofV = list(lof_value)

# Second CSV column as the data, first CSV column as the row index.
df = pd.DataFrame(b, index=a)
df['lof'] = lofV
df_index = shuju.index
# insert(position, column name, values): final column order is 0, 'index', 'lof'.
df.insert(1, 'index', df_index)

df.to_csv("aaa.csv")

#pd.DataFrame([[1, 'w', 3],[4, 5, 6],[1, 2, 3],[4, 5, 6]], columns=['col1','col2','col3'])
#predict.columns