当我傻傻地用 Python 手写 DBSCAN 的时候,才突然想起 scikit-learn 中已经有现成的 DBSCAN,可以直接调用。我本来想放弃这份快要完成的代码,但转念一想,可以把它发到博客上,好吧!
# -*- coding: utf-8 -*-
import pandas as pd
import math
class Point(object):
    """A 2-D point carrying a ``visited`` flag used while clustering.

    The class is new-style (inherits from ``object``) so plain attribute
    assignment works the same way on Python 2.7 and Python 3; no custom
    ``__setattr__`` that writes into ``__dict__`` is needed.
    """

    def __init__(self, x, y, visited=False):
        """Store the coordinates and the initial ``visited`` state.

        Args:
            x: First coordinate.
            y: Second coordinate.
            visited: Whether the point has already been processed.
        """
        self.x = x
        self.y = y
        self.visited = visited

    def __repr__(self):
        return "Point(x=%r, y=%r, visited=%r)" % (self.x, self.y, self.visited)

    def getdist(self, p):
        """Return the Euclidean distance between this point and ``p``."""
        # hypot computes sqrt(dx*dx + dy*dy) without intermediate overflow.
        return math.hypot(self.x - p.x, self.y - p.y)

    def isSame(self, p):
        """Return True when ``p`` has exactly the same coordinates."""
        return self.x == p.x and self.y == p.y
class DBSCANP(object):
    """Density-based clustering of 2-D points read from a CSV file.

    A point is treated as a core point when strictly more than ``minPts``
    *other* points lie within distance ``rad`` of it. Clusters are grown
    from core points; a seed whose expansion does not exceed ``minPts``
    members is recorded as noise.

    Attributes:
        rad: Neighbourhood radius.
        minPts: Density threshold (see above for the exact comparison).
        pointCluster: List of clusters, each a list of point objects.
        noiseCluster: Flat list of point objects that ended up in no cluster.
        locList: DataFrame holding the two coordinate columns.
        totalPoint: List of ``Point`` objects built from ``locList``.
    """

    # Input file used when the caller does not pass an explicit path.
    DEFAULT_PATH = 'F:/Geolife Trajectories 1.3/Geolife Trajectories 1.3/Data/001/Solved.txt'

    def __init__(self, rad, minPts, path=None):
        """Load the points and store the clustering parameters.

        Args:
            rad: Neighbourhood radius.
            minPts: Density threshold.
            path: CSV file to read; defaults to ``DEFAULT_PATH``. The columns
                at positions 1 and 2 are used as the x and y coordinates.

        Raises:
            IOError: If the file cannot be opened. The original code printed
                a message and then failed later on an undefined variable.
        """
        self.rad = rad
        self.minPts = minPts
        self.pointCluster = []
        self.noiseCluster = []
        if path is None:
            path = self.DEFAULT_PATH
        # The context manager closes the handle even if parsing fails.
        with open(path) as fr:
            data = pd.read_csv(fr)
        loc_list = data.iloc[:, [1, 2]]
        self.locList = loc_list
        # Positional access replaces the removed ``.ix`` indexer.
        self.totalPoint = [
            Point(x, y)
            for x, y in zip(loc_list.iloc[:, 0], loc_list.iloc[:, 1])
        ]

    def recursivedbscan(self, p, parentCluster):
        """Grow ``parentCluster`` from ``p`` by following core points.

        The traversal is depth-first in the same order as a recursive
        formulation, but uses an explicit stack so that long chains of
        points cannot exceed the interpreter recursion limit.

        Args:
            p: Point to start from.
            parentCluster: List extended in place with the reachable points.
        """
        stack = [p]
        while stack:
            current = stack.pop()
            if current.visited:
                continue
            current.visited = True
            # Collect every other point within the radius. The neighbour list
            # is built once per point (it used to be reset on each iteration),
            # and only the point object itself is skipped, so distinct points
            # sharing coordinates still count as neighbours.
            neighbours = [
                other for other in self.totalPoint
                if other is not current and current.getdist(other) <= self.rad
            ]
            if len(neighbours) > self.minPts:
                # The current point belongs to the cluster it seeds.
                neighbours.append(current)
                self.addCluster(neighbours, parentCluster)
                # Push in reverse so the first neighbour is expanded first.
                stack.extend(
                    other for other in reversed(neighbours)
                    if not other.visited
                )

    @staticmethod
    def addCluster(cluster, partentCluster):
        """Append to ``partentCluster`` the points of ``cluster`` it lacks.

        Declared static because it takes no ``self``; this keeps both
        ``self.addCluster(a, b)`` and ``DBSCANP.addCluster(a, b)`` working.
        Membership is decided by object identity.

        Args:
            cluster: Iterable of candidate points.
            partentCluster: List extended in place.
        """
        present = set(id(point) for point in partentCluster)
        for point in cluster:
            if id(point) not in present:
                present.add(id(point))
                partentCluster.append(point)

    def dbscanCluster(self):
        """Cluster every unvisited point, then reconcile the noise list."""
        for point in self.totalPoint:
            if point.visited:
                continue
            # Start each seed with an empty cluster.
            cluster = []
            self.recursivedbscan(point, cluster)
            if len(cluster) > self.minPts:
                self.pointCluster.append(cluster)
            else:
                # A seed that is not a core point leaves ``cluster`` empty;
                # record the seed itself so the noise point is not lost.
                self.noiseCluster.extend(cluster or [point])
        # All seeds processed: drop noise entries that a later cluster absorbed.
        self.removeNoisePoint()

    def removeNoisePoint(self):
        """Remove from ``noiseCluster`` every point that is in some cluster.

        A point can be recorded as noise first and later be picked up as a
        neighbour of a core point; such a point is kept in its cluster and
        removed from the noise list. The list is updated in place.
        """
        clustered = set(
            id(point) for cluster in self.pointCluster for point in cluster
        )
        self.noiseCluster[:] = [
            point for point in self.noiseCluster if id(point) not in clustered
        ]

    def printCluster(self):
        """Print the coordinates of every clustered point, one per line."""
        for cluster in self.pointCluster:
            for point in cluster:
                # Single-argument form prints identically on Python 2 and 3.
                print("x:%s,y:%s" % (point.x, point.y))
if __name__ == '__main__':
    # Script entry point: cluster with radius 2 and threshold 1, then
    # print the coordinates of every clustered point.
    clusterer = DBSCANP(rad=2, minPts=1)
    clusterer.dbscanCluster()
    clusterer.printCluster()