LOF算法
全称为 Local Outlier Factor(局部异常因子),是一种基于密度的异常点检测算法:通过比较某点与其近邻的局部可达密度来判断该点是否异常。
理论链接: LOF算法理论.
- 手码
由于需要反复搜寻最近的K个点,采用KDTree来加快近邻查询
import math

from scipy import spatial
# items in data_list should be unique
# K is the number of neighbors
# return LOF score
def getLOF(data_list, K, test_data):
    """Return the LOF (local outlier factor) score of ``test_data``.

    Args:
        data_list: Reference points used to build the KD-tree.
        K: Requested number of neighbours.
        test_data: The point to score.

    Returns:
        The sum of the neighbours' local reachability densities divided by
        the test point's own density, averaged over the neighbours.

    One more neighbour than ``K`` is queried, so that a test point which is
    itself a member of ``data_list`` still has ``K`` other points around it.
    NOTE(review): nothing here removes that self-match from the neighbour
    list, and a test point outside ``data_list`` gets ``K + 1`` real
    neighbours - confirm this is intended.
    """
    neighbor_count = K + 1

    # Index the reference points for fast nearest-neighbour queries.
    tree = spatial.KDTree(data_list)

    # Neighbourhood of the test point.
    neighbors, _ = getKNodesAndDistance(tree, neighbor_count, test_data)
    n_neighbors = len(neighbors)

    # Density of the test point: inverse of its mean reach distance.
    total_reach = getReachDistance(neighbors, tree, neighbor_count, test_data)
    test_density = 1 / (total_reach / n_neighbors)

    # Summed densities of the test point's neighbours.
    neighbor_density_sum = getReachDensity(neighbors, tree, neighbor_count)

    return (neighbor_density_sum / test_density) / n_neighbors
# Euclid distance
def getDistance(a, b):
    """Return the Euclidean distance between points ``a`` and ``b``.

    Args:
        a: Sequence of numeric coordinates.
        b: Sequence of numeric coordinates.

    Returns:
        The straight-line distance as a float.

    Coordinates are paired positionally, so points of any dimension are
    handled; this keeps the result consistent with the p=2 distances the
    KD-tree computes over all coordinates. If the two points differ in
    length, the extra trailing coordinates of the longer one are ignored.
    """
    return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))
def getKNodesAndDistance(kdtree, K, test_data):
    """Look up the ``K`` nearest stored points to ``test_data``.

    Args:
        kdtree: KD-tree to query; its ``data`` attribute is indexed with the
            indices returned by ``query``.
        K: Number of neighbours to request.
        test_data: The query point.

    Returns:
        A tuple ``(nodes, distances)``: the list of stored points found, in
        the order ``query`` returned them, and the matching distances exactly
        as returned by ``query`` (p=2, i.e. Euclidean).

    NOTE(review): the index result is iterated directly, so this presumably
    expects ``K >= 2`` (a non-scalar result) - confirm against callers.
    """
    distances, indices = kdtree.query(test_data, k=K, p=2)
    nodes = [kdtree.data[index] for index in indices]
    return nodes, distances
def getReachDistance(KNodes, tree, K, test_data):
    """Return the summed reach distance from ``test_data`` to each of ``KNodes``.

    Args:
        KNodes: Neighbour points of ``test_data``.
        tree: KD-tree used to look up each neighbour's own neighbourhood.
        K: Number of neighbours requested in each lookup.
        test_data: The point the reach distances are measured to.

    Returns:
        The sum over all neighbours of their reach distance (0 when
        ``KNodes`` is empty).
    """
    reach_distances = []
    for neighbor in KNodes:
        _, neighbor_distances = getKNodesAndDistance(tree, K, neighbor)
        # K-distance: the largest distance from this neighbour to its own
        # K nearest points (the last entry of the returned distances).
        k_distance = neighbor_distances[-1]
        direct_distance = getDistance(neighbor, test_data)
        # Reach distance: the larger of the neighbour's K-distance and the
        # straight-line distance between the neighbour and the test point.
        reach_distances.append(max(direct_distance, k_distance))
    return sum(reach_distances)
def getReachDensity(KNodes, tree, K):
    """Return the summed local reachability density of the points in ``KNodes``.

    Args:
        KNodes: Points whose densities are accumulated.
        tree: KD-tree used for the neighbour lookups.
        K: Number of neighbours requested in each lookup.

    Returns:
        The sum of each point's density, where a point's density is the
        inverse of its mean reach distance to its own neighbours (0 when
        ``KNodes`` is empty).
    """
    density_sum = 0
    for node in KNodes:
        own_neighbors, _ = getKNodesAndDistance(tree, K, node)
        total_reach = getReachDistance(own_neighbors, tree, K, node)
        mean_reach = total_reach / len(own_neighbors)
        density_sum += 1 / mean_reach
    return density_sum
- 调库
from sklearn.neighbors import LocalOutlierFactor as LOF

# Deduplicate once so that fit and predict see the same samples in the same
# order (list_fortest is defined elsewhere; its items must be hashable).
unique_samples = list(set(list_fortest))

# LOF model with 5 neighbours; contamination=0.1 sets the outlier ratio to 10%.
model = LOF(n_neighbors=5, contamination=0.1, novelty=True, n_jobs=-1)
model.fit(unique_samples)

# y holds one label per entry of unique_samples: -1 marks an outlier, 1 an inlier.
# Uses the public predict() rather than the private _predict().
# NOTE(review): scikit-learn advises using a novelty=True model only on unseen
# data; to label the training set itself, novelty=False with fit_predict() is
# the documented route - confirm which behaviour is intended.
y = model.predict(unique_samples)