# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import math
import time
# Label stored for a point that has not been assigned to anything yet
# (dbscan initialises every entry of its result list with this value).
UNCLASSIFIED = False
# Label stored by expand_cluster for a start point with too few neighbours.
# NOTE(review): in Python ``False == 0`` is true, so UNCLASSIFIED and NOISE
# compare equal under ``==``; any ``== UNCLASSIFIED`` test also matches NOISE.
NOISE = 0
def loadDataSet(fileName, splitChar='\t'):
    """Read a delimited text file of numbers into a list of float rows.

    Args:
        fileName: Path of the text file to read.
        splitChar: Delimiter separating the values on a line (tab by default).

    Returns:
        A list holding one list of floats per non-blank line of the file.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be converted to ``float``.
    """
    dataSet = []
    with open(fileName) as fr:
        # Iterate the file object directly instead of materialising every
        # line up front with readlines().
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Blank lines (typically a trailing newline at the end of the
                # file) would otherwise crash on float('').
                continue
            fields = stripped.split(splitChar)
            dataSet.append([float(field) for field in fields])
    return dataSet
##########################################################################################################################
def dist(a, b):
    """Return the Euclidean distance between the vectors ``a`` and ``b``.

    Both arguments must support element-wise subtraction (NumPy arrays or
    matrices); the result is a plain Python float.
    """
    delta = a - b
    squared_total = np.power(delta, 2).sum()
    return math.sqrt(squared_total)
def eps_neighbor(a, b, eps):
    """Tell whether ``a`` and ``b`` are strictly closer than ``eps``.

    The distance is computed by ``dist``; points exactly ``eps`` apart are
    not considered neighbours.
    """
    separation = dist(a, b)
    return separation < eps
def region_query(data, pointId, eps):
    """Return the ids of all points in the eps-neighbourhood of ``pointId``.

    ``data`` is indexed column-wise: ``data[:, i]`` is point ``i`` and
    ``data.shape[1]`` is the number of points. Every column is tested with
    ``eps_neighbor``, so the queried point itself is part of the result
    whenever ``eps`` is positive.
    """
    return [
        candidate
        for candidate in range(data.shape[1])
        if eps_neighbor(data[:, pointId], data[:, candidate], eps)
    ]
def expand_cluster(data, clusterResult, pointId, clusterId, eps, minPts):
    """Try to grow the cluster ``clusterId`` starting from ``pointId``.

    Args:
        data: Point collection indexed column-wise (``data[:, i]``).
        clusterResult: Mutable list of per-point labels; updated in place.
        pointId: Index of the point the expansion starts from.
        clusterId: Label written to every point that joins the cluster.
        eps: Neighbourhood radius handed to ``region_query``.
        minPts: Minimum neighbourhood size for a point to expand the cluster.

    Returns:
        ``False`` when the start point has fewer than ``minPts`` neighbours
        (it is then labelled ``NOISE``), ``True`` once a cluster was grown.
    """
    seeds = region_query(data, pointId, eps)
    if len(seeds) < minPts:
        # Too few neighbours: the start point cannot seed a cluster.
        clusterResult[pointId] = NOISE
        return False

    # The start point and its whole neighbourhood join the cluster.
    clusterResult[pointId] = clusterId
    for seedId in seeds:
        clusterResult[seedId] = clusterId

    # Walk the growing seed list with a cursor. Re-slicing the list with
    # seeds[1:] on every step copies all remaining seeds each time, which
    # makes the expansion quadratic in the number of seeds.
    cursor = 0
    while cursor < len(seeds):
        currentPoint = seeds[cursor]
        cursor += 1
        queryResults = region_query(data, currentPoint, eps)
        if len(queryResults) < minPts:
            # Points with a sparse neighbourhood do not spread the cluster.
            continue
        for resultPoint in queryResults:
            label = clusterResult[resultPoint]
            if label == UNCLASSIFIED:
                # Newly reached point: label it and queue it for expansion.
                # NOTE(review): with UNCLASSIFIED = False and NOISE = 0 this
                # comparison also matches NOISE labels, so the branch below
                # only becomes reachable if the sentinels are made distinct.
                seeds.append(resultPoint)
                clusterResult[resultPoint] = clusterId
            elif label == NOISE:
                # Former noise point: it joins the cluster but is not queued.
                clusterResult[resultPoint] = clusterId
    return True
def dbscan(data, eps, minPts):
    """Assign a cluster label to every column of ``data``.

    Args:
        data: Point collection indexed column-wise; ``data.shape[1]`` is the
            number of points.
        eps: Neighbourhood radius forwarded to ``expand_cluster``.
        minPts: Minimum neighbourhood size forwarded to ``expand_cluster``.

    Returns:
        A tuple ``(clusterResult, clusterCount)``: the list of per-point
        labels (cluster ids start at 1) and the number of clusters created.
    """
    clusterId = 1
    nPoints = data.shape[1]  # number of points (columns), not dimensions
    clusterResult = [UNCLASSIFIED] * nPoints
    for pointId in range(nPoints):
        if clusterResult[pointId] != UNCLASSIFIED:
            # Already labelled while an earlier cluster was expanded.
            continue
        if expand_cluster(data, clusterResult, pointId, clusterId, eps, minPts):
            # A cluster was created; the next one gets a fresh id.
            clusterId += 1
    # clusterId is always one ahead of the last id actually handed out.
    return clusterResult, clusterId - 1
############################################################################################################################
def main():
    """Load the sample CSV file and cluster its rows with DBSCAN.

    Returns:
        The ``(clusters, clusterNum)`` tuple produced by ``dbscan`` so the
        result can be inspected by a caller instead of being discarded.
    """
    dataSet = loadDataSet('tem-level-400.csv', splitChar=',')
    # np.mat was removed in NumPy 2.0; np.asmatrix is the equivalent call.
    # Transpose so that every column is one sample, because dbscan indexes
    # points as data[:, i].
    dataSet = np.asmatrix(dataSet).transpose()
    # Neighbourhood radius 0.05, at least 3 points per neighbourhood.
    clusters, clusterNum = dbscan(dataSet, 0.05, 3)
    return clusters, clusterNum
if __name__ == '__main__':
    # time.clock() was removed in Python 3.8; time.perf_counter() is the
    # high-resolution clock intended for measuring elapsed time.
    start = time.perf_counter()
    main()
    end = time.perf_counter()
    print('Finish all in %s' % str(end - start))
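# Source article title: "DBSCAN算法解释" (DBSCAN algorithm explained).
# Web-page footer: "最新推荐文章于 2024-05-11 20:57:59 发布"
# (latest recommended article published on 2024-05-11 20:57:59).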