DBSCAN算法解释

最新推荐文章于 2024-05-11 20:57:59 发布

zhangcongchn

最新推荐文章于 2024-05-11 20:57:59 发布

阅读量829

点赞数

# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import math
import time

# Label of a point that has not been assigned to a cluster or to noise yet.
# It must not compare equal to NOISE: the previous value ``False`` equals ``0``
# in Python, so ``label == UNCLASSIFIED`` was also true for noise points.
UNCLASSIFIED = None
# Label of a point that belongs to no cluster.
NOISE = 0

def loadDataSet(fileName, splitChar='\t'):
	"""Read a delimited text file of numbers into a list of rows.

	Args:
		fileName: Path of the file to read.
		splitChar: Delimiter between the values of a line (tab by default).

	Returns:
		A list holding one list of floats per non-blank line of the file.

	Raises:
		ValueError: If a field cannot be converted to ``float``.
	"""
	dataSet = []
	with open(fileName) as fr:
		# Iterate the file object directly instead of materialising every
		# line with readlines().
		for line in fr:
			stripped = line.strip()
			if not stripped:
				# A blank line would otherwise reach float('') and raise.
				continue
			dataSet.append([float(field) for field in stripped.split(splitChar)])
	return dataSet

##########################################################################################################################

def dist(a, b):
	"""Return the Euclidean distance between two vectors.

	Args:
		a: First vector; any array-like (list, tuple, ndarray or matrix).
		b: Second vector, with a shape compatible with ``a``.

	Returns:
		The distance as a Python float.
	"""
	# asarray lets plain lists and tuples be passed in; previously both
	# operands already had to support ``a - b``.
	diff = np.asarray(a) - np.asarray(b)
	return math.sqrt(np.power(diff, 2).sum())


def eps_neighbor(a, b, eps):
	"""Tell whether two points lie strictly closer than ``eps`` to each other.

	Args:
		a: First point, passed on to ``dist``.
		b: Second point, passed on to ``dist``.
		eps: Distance threshold (exclusive).

	Returns:
		True when ``dist(a, b)`` is less than ``eps``.
	"""
	separation = dist(a, b)
	return separation < eps


def region_query(data, pointId, eps):
	"""Collect the indices of all points within ``eps`` of a given point.

	Args:
		data: 2-D array or matrix; each column ``data[:, i]`` is one point.
		pointId: Column index of the point whose neighbourhood is queried.
		eps: Neighbourhood radius, compared via ``eps_neighbor``.

	Returns:
		A list of column indices, in ascending order, for which
		``eps_neighbor`` reports the column as a neighbour of ``pointId``.
	"""
	# data.shape[1] is the number of columns, i.e. the number of points.
	return [
		candidate
		for candidate in range(data.shape[1])
		if eps_neighbor(data[:, pointId], data[:, candidate], eps)
	]

def expand_cluster(data, clusterResult, pointId, clusterId, eps, minPts):
	"""Try to grow a new cluster starting from ``pointId``.

	Args:
		data: 2-D array or matrix whose columns are the points.
		clusterResult: List of per-point labels; modified in place.
		pointId: Column index of the point to start from.
		clusterId: Label to assign to every point of the new cluster.
		eps: Neighbourhood radius passed to ``region_query``.
		minPts: Minimum number of points ``region_query`` must return for a
			point to be expanded.

	Returns:
		False when the neighbourhood of ``pointId`` holds fewer than
		``minPts`` points (the point is then labelled NOISE); True once the
		cluster has been fully expanded.
	"""
	from collections import deque

	seeds = region_query(data, pointId, eps)
	if len(seeds) < minPts:
		# Too few neighbours: not a core point, so label it as noise.
		clusterResult[pointId] = NOISE
		return False

	clusterResult[pointId] = clusterId
	for seedId in seeds:
		clusterResult[seedId] = clusterId

	# A deque pops from the left in O(1); the earlier ``seeds = seeds[1:]``
	# copied the whole remaining list on every iteration.
	pending = deque(seeds)
	while pending:
		currentPoint = pending.popleft()
		queryResults = region_query(data, currentPoint, eps)
		if len(queryResults) < minPts:
			# Not dense enough to expand: the point keeps its label but
			# contributes no new seeds.
			continue
		for resultPoint in queryResults:
			label = clusterResult[resultPoint]
			if label == UNCLASSIFIED:
				# Newly reached point: claim it and expand from it later.
				pending.append(resultPoint)
				clusterResult[resultPoint] = clusterId
			elif label == NOISE:
				# Previously marked noise but reachable from this cluster:
				# relabel it without expanding from it.
				clusterResult[resultPoint] = clusterId
	return True

def dbscan(data, eps, minPts):
	"""Assign a cluster label to every column of ``data`` using DBSCAN.

	Args:
		data: 2-D array or matrix whose columns are the points.
		eps: Neighbourhood radius.
		minPts: Minimum neighbourhood size handed to ``expand_cluster``.

	Returns:
		A tuple ``(clusterResult, clusterCount)``. ``clusterResult`` is a
		list with one label per column: cluster ids start at 1 and NOISE
		marks points that ended up in no cluster. ``clusterCount`` is the
		number of clusters that were created.
	"""
	clusterId = 1
	# shape[1] is the number of columns, i.e. the number of points (not the
	# dimensionality of a point).
	nPoints = data.shape[1]
	clusterResult = [UNCLASSIFIED] * nPoints
	for pointId in range(nPoints):
		# Only points without a label may seed a new cluster; the id is
		# consumed only when the expansion actually succeeded.
		if clusterResult[pointId] == UNCLASSIFIED and expand_cluster(
				data, clusterResult, pointId, clusterId, eps, minPts):
			clusterId += 1
	return clusterResult, clusterId - 1
############################################################################################################################
def main(fileName='tem-level-400.csv', eps=0.05, minPts=3):
	"""Load a comma-separated data file and cluster its rows with DBSCAN.

	Args:
		fileName: Path of the comma-separated input file; each line is one
			point.
		eps: Neighbourhood radius passed to ``dbscan``.
		minPts: Minimum neighbourhood size passed to ``dbscan``.

	Returns:
		The ``(clusters, clusterNum)`` tuple produced by ``dbscan``.
	"""
	dataSet = loadDataSet(fileName, splitChar=',')
	# np.mat was removed in NumPy 2.0. A transposed 2-D ndarray gives the
	# same one-point-per-column layout that dbscan indexes with data[:, i];
	# atleast_2d keeps the result two-dimensional like np.mat did.
	dataSet = np.atleast_2d(np.array(dataSet)).transpose()
	# Return the result instead of discarding it so callers can use it.
	return dbscan(dataSet, eps, minPts)
	
	

	
if __name__ == '__main__':
	# time.clock() was removed in Python 3.8; perf_counter() is the
	# documented replacement for measuring elapsed time.
	start = time.perf_counter()
	main()
	end = time.perf_counter()
	print('Finish all in %s' % str(end - start))

zhangcongchn

关注

0
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
DBSCAN算法解释

# -*- coding: utf-8 -*- ; import numpy as np ; import matplotlib.pyplot as plt ; import math ; import time ; UNCLASSIFIED = False ; NOISE = 0 ; def loadDataSet(fileName, splitChar='\t'):  # 读取数据 ; dataSet = [] ; with ...
复制链接

扫一扫