工程应用领域:
欺诈检测:信用卡的不正当行为,如信用卡、社会保障的欺诈行为或者是银行卡、电话卡的欺诈使用等
入侵检测:如计算机网络的非法访问等
活动监控:通过实时检测手机活跃度或者是股权市场的可疑交易,从而实现检测移动手机诈骗行为等
网络性能:计算机网络性能检测(稳健性分析),检测网络堵塞情况等
自然生态应用领域:生态系统失调、异常自然气候的发现等
公共服务领域:公共卫生中的异常疾病的爆发、公共安全中的突发事件的发生等
"""
Created on Mon Sep 3 10:44:10 2018
@author: lenovo
"""
def localoutlierfactor(data, predict, k):
    """Fit a Local Outlier Factor model on ``data`` and score ``predict``.

    Two columns are appended to ``predict`` in place:

    * ``'k distances'`` -- for each row, the largest of the distances to
      its ``k`` nearest neighbours in ``data``.
    * ``'local outlier factor'`` -- the LOF score, i.e. the negated
      ``score_samples`` output, so that larger means more anomalous.

    Args:
        data: Training samples, array-like of shape (n_samples, n_features).
        predict: ``pandas.DataFrame`` containing only the feature columns of
            the samples to score. It is modified in place.
        k: Number of neighbours used by the LOF model.

    Returns:
        The same ``predict`` object, now carrying the two extra columns.

    Note:
        Requires scikit-learn >= 0.20 for ``novelty=True`` and
        ``score_samples``. Rows of ``predict`` that also occur in ``data``
        are queried like any other sample, so each such row counts itself
        as a neighbour at distance zero.
    """
    from sklearn.neighbors import LocalOutlierFactor

    # novelty=True enables the public score_samples() API for scoring
    # arbitrary samples; the private _decision_function() used previously is
    # not a stable interface and in later releases is shifted by offset_.
    clf = LocalOutlierFactor(
        n_neighbors=k,
        algorithm='auto',
        contamination=0.05,
        n_jobs=-1,
        novelty=True,
    )
    clf.fit(data)

    # Compute both results before appending anything, so the model only ever
    # sees the original feature columns.
    neighbor_distances, _ = clf.kneighbors(predict)
    k_distances = neighbor_distances.max(axis=1)
    # score_samples() returns the opposite of the LOF; negate to get the LOF.
    lof_scores = -clf.score_samples(predict)

    predict['k distances'] = k_distances
    predict['local outlier factor'] = lof_scores
    return predict
def plot_lof(result, method):
    """Show a scatter plot of scored samples split at an LOF threshold.

    Rows whose ``'local outlier factor'`` is greater than ``method`` are
    drawn in red, all remaining rows in blue. Column ``0`` of ``result``
    supplies the x coordinates and column ``1`` the y coordinates.

    Args:
        result: DataFrame with columns ``0``, ``1`` and
            ``'local outlier factor'``.
        method: LOF threshold separating outliers from normal points.
    """
    import matplotlib.pyplot as plt

    # Use a font that can render the Chinese labels, and keep the minus
    # sign displayable with that font.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(8, 4)).add_subplot(111)

    # (row selector, marker colour, legend label) for each class of point;
    # outliers are drawn first, then the normal points.
    groups = (
        (lambda scores: scores > method, 'red', '离群点'),
        (lambda scores: scores <= method, 'blue', '正常点'),
    )
    for selector, colour, legend_text in groups:
        labels = result[selector(result['local outlier factor'])].index
        rows = result.loc[labels]
        plt.scatter(rows[0], rows[1], c=colour, s=50,
                    marker='.', alpha=None, label=legend_text)

    # NOTE(review): the x axis is fixed to [0, 1]; presumably the first
    # feature is scaled to that range -- confirm against the input data.
    plt.xlim(0, 1)
    plt.title('LOF局部离群点检测', fontsize=13)
    plt.legend()
    plt.show()
def lof(data, predict=None, k=12, method=1, plot=True):
    """Score samples with Local Outlier Factor and split them at a threshold.

    Args:
        data: Training samples used to fit the LOF model.
        predict: Samples to score. When ``None``, ``data`` itself is scored.
        k: Number of neighbours passed to the LOF model.
        method: LOF threshold; rows scoring strictly above it are outliers.
        plot: When truthy, display the scatter plot of the scored samples.

    Returns:
        A tuple ``(outliers, inliers, predict)`` of DataFrames. ``predict``
        holds every scored row with the added ``'k distances'`` and
        ``'local outlier factor'`` columns; ``outliers`` and ``inliers``
        are its rows above / not above ``method``, each sorted by ascending
        LOF.
    """
    import pandas as pd

    # Score the training data itself when no separate samples are supplied.
    # An identity check is required here: comparing a DataFrame or array
    # with ``== None`` is element-wise and has no single truth value.
    if predict is None:
        predict = data
    # Work on a private copy so the columns appended during scoring never
    # leak into the caller's objects.
    predict = pd.DataFrame(predict, copy=True)

    # Compute the k-distance and LOF columns.
    predict = localoutlierfactor(data, predict, k)
    if plot:
        plot_lof(predict, method)

    # Partition the scored rows at the threshold.
    scores = predict['local outlier factor']
    outliers = predict[scores > method].sort_values(by='local outlier factor')
    inliers = predict[scores <= method].sort_values(by='local outlier factor')
    return outliers, inliers, predict
import numpy as np
import pandas as pd
# Load the input table; the 'single' and 'mulit' columns are the two features.
shuju = pd.read_csv('C:\\Users\\lenovo\\Desktop\\AA.csv', sep=",")
lon = np.array(shuju["single"][:])
lat = np.array(shuju["mulit"][:])
# One (mulit, single) pair per input row.
A = list(zip(lat, lon))

# First and second CSV columns, selected by position. Column-wise selection
# keeps each column's own dtype and avoids integer-position lookups on a
# label-indexed row Series, which pandas has deprecated.
a = shuju.iloc[:, 0].tolist()
b = shuju.iloc[:, 1].tolist()

# Score every row against the full data set using 12 neighbours; rows whose
# LOF exceeds 1.3 are treated as outliers.
outliers1, inliers1, predict = lof(A, k=12, method=1.3)
print(outliers1, inliers1, predict)

lof_value = predict["local outlier factor"]
index = list(shuju.index)
lofV = list(lof_value)
df_index = shuju.index

# Output table, indexed by the first CSV column: the second CSV column
# (under the column name 0), the original row number, and the LOF score.
df = pd.DataFrame({0: b, 'index': index, 'lof': lofV}, index=a)
df.to_csv("aaa.csv")