3D Point Cloud Learning (4-5): DBSCAN Python Implementation - 2 - kd-tree Acceleration
In the previous chapter, DBSCAN had to build an N*N distance matrix, which is extremely memory- and compute-intensive. This chapter therefore optimizes it with kd-tree search: the neighborhood sets are built with the kd-tree's radius NN query, which greatly improves the running speed.
Previous post: DBSCAN Python Implementation - 1 - Distance Matrix Method
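To make the switch concrete, here is a minimal sketch (not from the original scripts) contrasting the two ways of collecting every point's eps-neighborhood; `data` and `eps` are placeholder names, and sklearn.neighbors.KDTree is the same class used in the last section of this post:

import numpy as np
from sklearn.neighbors import KDTree

data = np.random.rand(1000, 2)  # placeholder point cloud
eps = 0.1                       # placeholder search radius

# part 1 approach: an N x N distance matrix, O(N^2) time and memory
dist = np.linalg.norm(data[:, None, :] - data[None, :, :], axis=2)
neighbors_dm = [np.flatnonzero(dist[i] <= eps) for i in range(len(data))]

# this post's approach: kd-tree radius NN, no N x N matrix is ever built
tree = KDTree(data, leaf_size=4)
neighbors_kd = tree.query_radius(data, eps)  # one array of neighbor indices per point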
Final clustering results of kd-tree DBSCAN using the self-written kd-tree, the scipy library, and the sklearn library
Original data:
Self-written kd-tree radius NN:
Number of clusters generated: 3
dbscan time:3.233520
Using the kd-tree from scipy.spatial:
(The white points are noise.)
Before optimization (distance-matrix method):
Number of clusters generated: 4
dbscan time:19.526319
After optimization with the self-written kd-tree, roughly a 20x speedup!
Number of clusters generated: 3
dbscan time:1.043557
After optimization with sklearn's kd-tree; the sklearn library is faster still:
Number of clusters generated: 5
dbscan time:0.030102
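As a sanity check (not part of the original scripts), sklearn's built-in DBSCAN can be run on the same data; this is a minimal sketch assuming the `X`, eps=0.5, and Minpts=15 used in the scripts below:

from sklearn.cluster import DBSCAN as SkDBSCAN

labels = SkDBSCAN(eps=0.5, min_samples=15, algorithm='kd_tree').fit_predict(X)
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)  # label -1 marks noise
print("sklearn DBSCAN clusters:", n_clusters)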
Main optimized parts (code snippets)
Build the kd-tree
n = len(data)
# build the kd-tree
leaf_size = 4
root = kdtree.kdtree_construction(data, leaf_size=leaf_size)
# step 1: initialize the core-point set T, cluster count k, cluster list C, and the unvisited set
T = set()                  # core-point set
k = 0                      # cluster counter
C = []                     # list of clusters
unvisited = set(range(n))  # all points start unvisited
Step 2: find all core points with kd-tree radius NN
# step 2: find all core points via kd-tree radius NN
for d in range(n):
    result_set = RadiusNNResultSet(radius=eps)  # radius NN search with radius eps
    kdtree.kdtree_radius_search(root, data, result_set, data[d])
    nearest_idx = result_set.radius_nn_output_index()
    if len(nearest_idx) >= Minpts:  # neighbor count >= Minpts: d is a core point
        T.add(d)                    # initial core points
Get the neighbors of new_core
# kd-tree radius NN search for the neighbors
result_set = RadiusNNResultSet(radius=eps)  # radius NN search with radius eps
kdtree.kdtree_radius_search(root, data, result_set, data[new_core])
new_core_nearest = result_set.radius_nn_output_index()  # neighbors of new_core
Full code
#DBSCAN_fast.py
# File purpose: DBSCAN clustering accelerated with a self-written kd-tree
import numpy as np
import matplotlib.pyplot as plt
from result_set import KNNResultSet, RadiusNNResultSet
import kdtree as kdtree
import time

plt.style.use('seaborn')

# display a 2D point cloud with matplotlib
def Point_Show(point, color):
    x = []
    y = []
    point = np.asarray(point)
    for i in range(len(point)):
        x.append(point[i][0])
        y.append(point[i][1])
    plt.scatter(x, y, color=color)

# build the full N*N distance matrix (the part 1 approach that the kd-tree replaces)
def my_distance_Marix(data):
    S = np.zeros((len(data), len(data)))  # initialize the N*N distance matrix
    for i in range(len(data)):      # i: row
        for j in range(len(data)):  # j: column
            S[i][j] = np.linalg.norm(data[i] - data[j])  # 2-norm distance between points i and j
    return S

# @profile
def DBSCAN(data, eps, Minpts):
    """
    Density-based point cloud clustering
    :param data: point cloud, NxD
    :param eps: neighborhood search radius
    :param Minpts: minimum number of neighbors required for a core point
    :return: the clustering result as a nested list; each sub-list holds the point indices of one cluster
    """
    n = len(data)
    # build the kd-tree
    leaf_size = 4
    root = kdtree.kdtree_construction(data, leaf_size=leaf_size)
    # step 1: initialize the core-point set T, cluster count k, cluster list C, and the unvisited set
    T = set()                  # core-point set
    k = 0                      # cluster counter
    C = []                     # list of clusters
    unvisited = set(range(n))  # all points start unvisited
    # step 2: find all core points via kd-tree radius NN
    for d in range(n):
        result_set = RadiusNNResultSet(radius=eps)  # radius NN search with radius eps
        kdtree.kdtree_radius_search(root, data, result_set, data[d])
        nearest_idx = result_set.radius_nn_output_index()
        if len(nearest_idx) >= Minpts:  # neighbor count >= Minpts: d is a core point
            T.add(d)                    # initial core points
    # step 3: clustering
    while len(T):  # keep going until every core point has been visited
        unvisited_old = unvisited                     # snapshot of the unvisited set
        core = list(T)[np.random.randint(0, len(T))]  # pick a random core point from T
        unvisited = unvisited - set([core])           # mark the core as visited: drop it from unvisited
        visited = []
        visited.append(core)
        while len(visited):
            new_core = visited[0]
            # kd-tree radius NN search for the neighbors
            result_set = RadiusNNResultSet(radius=eps)  # radius NN search with radius eps
            kdtree.kdtree_radius_search(root, data, result_set, data[new_core])
            new_core_nearest = result_set.radius_nn_output_index()  # neighbors of new_core
            if len(new_core_nearest) >= Minpts:
                S = unvisited & set(new_core_nearest)  # the core's still-unvisited neighbors
                visited += (list(S))                   # points reachable from this core still need checking
                unvisited = unvisited - S              # drop the newly reached points from unvisited
            visited.remove(new_core)                   # new_core has been processed; drop it
        k += 1                               # one more cluster
        cluster = unvisited_old - unvisited  # everything removed from unvisited this round forms the cluster
        T = T - cluster                      # remove this cluster's core points from T
        C.append(cluster)                    # record the cluster
    print("Number of clusters generated: %d" % k)
    return C, k

# generate synthetic data
def generate_X(true_Mu, true_Var):
    # first cluster
    num1, mu1, var1 = 400, true_Mu[0], true_Var[0]
    X1 = np.random.multivariate_normal(mu1, np.diag(var1), num1)
    # second cluster
    num2, mu2, var2 = 600, true_Mu[1], true_Var[1]
    X2 = np.random.multivariate_normal(mu2, np.diag(var2), num2)
    # third cluster
    num3, mu3, var3 = 1000, true_Mu[2], true_Var[2]
    X3 = np.random.multivariate_normal(mu3, np.diag(var3), num3)
    # stack the clusters
    X = np.vstack((X1, X2, X3))
    # show the data
    plt.figure(figsize=(10, 8))
    plt.axis([-10, 15, -5, 15])
    plt.scatter(X1[:, 0], X1[:, 1], s=5)
    plt.scatter(X2[:, 0], X2[:, 1], s=5)
    plt.scatter(X3[:, 0], X3[:, 1], s=5)
    # plt.show()
    return X

if __name__ == '__main__':
    # generate data
    true_Mu = [[0.5, 0.5], [5.5, 2.5], [1, 7]]
    true_Var = [[1, 3], [2, 2], [6, 2]]
    X = generate_X(true_Mu, true_Var)
    begin_t = time.time()
    index, k = DBSCAN(X, eps=0.5, Minpts=15)
    dbscan_time = time.time() - begin_t
    print("dbscan time:%f" % dbscan_time)
    cluster = [[] for i in range(k)]
    for i in range(k):
        cluster[i] = [X[j] for j in index[i]]
    Point_Show(cluster[0], color="red")
    Point_Show(cluster[1], color="orange")
    Point_Show(cluster[2], color="blue")
    plt.show()
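The timings above also mention the kd-tree from scipy.spatial; that variant is not listed in full here, but a minimal sketch of the substitution might look like the following (assuming the same `data`, `eps`, and `new_core` as in the DBSCAN function above; only the neighbor queries change, the clustering loop stays identical):

from scipy.spatial import cKDTree

tree = cKDTree(data, leafsize=4)
# step 2: neighbors of every point within eps, as a list of index lists
nearest_idx = tree.query_ball_point(data, eps)
# step 3: neighbors of a single point inside the clustering loop
new_core_nearest = tree.query_ball_point(data[new_core], eps)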
#kdtree.py
import math
import numpy as np

from result_set import KNNResultSet, RadiusNNResultSet

class Node:
    def __init__(self, axis, value, left, right, point_indices):
        self.axis = axis
        self.value = value
        self.left = left
        self.right = right
        self.point_indices = point_indices

    def is_leaf(self):
        if self.value is None:
            return True
        else:
            return False

    def __str__(self):
        output = ''
        output += 'axis %d, ' % self.axis
        if self.value is None:
            output += 'split value: leaf, '
        else:
            output += 'split value: %.2f, ' % self.value
        output += 'point_indices: '
        output += str(self.point_indices.tolist())
        return output

def sort_key_by_vale(key, value):
    assert key.shape == value.shape  # assert raises an exception when the condition is false
    assert len(key.shape) == 1       # both must be 1-D numpy arrays
    sorted_idx = np.argsort(value)   # ascending sort by value
    key_sorted = key[sorted_idx]
    value_sorted = value[sorted_idx]
    return key_sorted, value_sorted

def axis_round_robin(axis, dim):  # rotate the split axis round-robin
    if axis == dim - 1:
        return 0
    else:
        return axis + 1

def kdtree_recursive_build(root, db, point_indices, axis, leaf_size):  # build the kd-tree recursively
    """
    :param root:
    :param db: NxD
    :param point_indices: M
    :param axis: scalar
    :param leaf_size: scalar
    :return:
    """
    if root is None:
        root = Node(axis, None, None, None, point_indices)  # instantiate a Node

    # determine whether to split into left and right
    if len(point_indices) > leaf_size:  # split only while the node holds more than leaf_size points
        # --- get the split position ---
        point_indices_sorted, _ = sort_key_by_vale(point_indices, db[point_indices, axis])  # sort the points along the current axis
        middle_left_idx = math.ceil(point_indices_sorted.shape[0] / 2) - 1  # split in half
        middle_left_point_idx = point_indices_sorted[middle_left_idx]       # last point of the left half
        middle_left_point_value = db[middle_left_point_idx, axis]

        middle_right_idx = middle_left_idx + 1
        middle_right_point_idx = point_indices_sorted[middle_right_idx]     # first point of the right half
        middle_right_point_value = db[middle_right_point_idx, axis]

        root.value = (middle_left_point_value + middle_right_point_value) * 0.5  # the split value is their midpoint
        # === get the split position ===
        root.left = kdtree_recursive_build(root.left,
                                           db,
                                           point_indices_sorted[0:middle_right_idx],
                                           axis_round_robin(axis, dim=db.shape[1]),
                                           leaf_size)   # recurse on the left half with the next axis
        root.right = kdtree_recursive_build(root.right,
                                            db,
                                            point_indices_sorted[middle_right_idx:],
                                            axis_round_robin(axis, dim=db.shape[1]),
                                            leaf_size)  # recurse on the right half with the next axis
    return root

def traverse_kdtree(root: Node, depth, max_depth):  # compute the depth of the kd-tree
    depth[0] += 1
    if max_depth[0] < depth[0]:
        max_depth[0] = depth[0]

    if root.is_leaf():  # print the leaf nodes
        print(root)
    else:
        traverse_kdtree(root.left, depth, max_depth)   # recurse while tracking the depth
        traverse_kdtree(root.right, depth, max_depth)
    depth[0] -= 1

def kdtree_construction(db_np, leaf_size):
    N, dim = db_np.shape[0], db_np.shape[1]

    # build kd_tree recursively
    root = None
    root = kdtree_recursive_build(root,
                                  db_np,
                                  np.arange(N),
                                  axis=0,
                                  leaf_size=leaf_size)
    return root

def kdtree_knn_search(root: Node, db: np.ndarray, result_set: KNNResultSet, query: np.ndarray):  # kNN search backed by the KNNResultSet container
    if root is None:
        return False

    if root.is_leaf():
        # compare the contents of a leaf (brute force inside the leaf)
        leaf_points = db[root.point_indices, :]
        diff = np.linalg.norm(np.expand_dims(query, 0) - leaf_points, axis=1)
        for i in range(diff.shape[0]):
            result_set.add_point(diff[i], root.point_indices[i])
        return False

    if query[root.axis] <= root.value:  # the query lies in the left partition, so the left subtree must be searched
        kdtree_knn_search(root.left, db, result_set, query)
        if math.fabs(query[root.axis] - root.value) < result_set.worstDist():  # if the query is closer to the split plane than worstDist, the right subtree may still hold neighbors
            kdtree_knn_search(root.right, db, result_set, query)
    else:
        kdtree_knn_search(root.right, db, result_set, query)
        if math.fabs(query[root.axis] - root.value) < result_set.worstDist():
            kdtree_knn_search(root.left, db, result_set, query)

    return False

def kdtree_radius_search(root: Node, db: np.ndarray, result_set: RadiusNNResultSet, query: np.ndarray):
    if root is None:
        return False

    if root.is_leaf():
        # compare the contents of a leaf (brute force inside the leaf)
        leaf_points = db[root.point_indices, :]
        diff = np.linalg.norm(np.expand_dims(query, 0) - leaf_points, axis=1)
        for i in range(diff.shape[0]):
            result_set.add_point(diff[i], root.point_indices[i])
        return False

    if query[root.axis] <= root.value:
        kdtree_radius_search(root.left, db, result_set, query)
        if math.fabs(query[root.axis] - root.value) < result_set.worstDist():
            kdtree_radius_search(root.right, db, result_set, query)
    else:
        kdtree_radius_search(root.right, db, result_set, query)
        if math.fabs(query[root.axis] - root.value) < result_set.worstDist():
            kdtree_radius_search(root.left, db, result_set, query)

    return False

def main():
    # configuration
    db_size = 64
    dim = 3        # 3-D points
    leaf_size = 4
    k = 1          # number of nearest neighbors

    db_np = np.random.rand(db_size, dim)

    root = kdtree_construction(db_np, leaf_size=leaf_size)

    depth = [0]
    max_depth = [0]
    traverse_kdtree(root, depth, max_depth)
    print("tree max depth: %d" % max_depth[0])

    query = np.asarray([0, 0, 0])
    result_set = KNNResultSet(capacity=k)
    kdtree_knn_search(root, db_np, result_set, query)
    print(result_set)
    print(db_np)
    #
    # diff = np.linalg.norm(np.expand_dims(query, 0) - db_np, axis=1)
    # nn_idx = np.argsort(diff)
    # nn_dist = diff[nn_idx]
    # print(nn_idx[0:k])
    # print(nn_dist[0:k])
    #
    # print("Radius search:")
    # query = np.asarray([0, 0, 0])
    # result_set = RadiusNNResultSet(radius=0.5)
    # kdtree_radius_search(root, db_np, result_set, query)
    # print(result_set)

if __name__ == '__main__':
    main()
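A short usage sketch for the module above (random 2-D points; this mirrors exactly how the DBSCAN function calls the radius search):

import numpy as np
from result_set import RadiusNNResultSet
import kdtree

db = np.random.rand(100, 2)
root = kdtree.kdtree_construction(db, leaf_size=4)

result_set = RadiusNNResultSet(radius=0.2)
kdtree.kdtree_radius_search(root, db, result_set, db[0])
print(result_set)                            # neighbor indices with their distances
print(result_set.radius_nn_output_index())  # just the indices, as DBSCAN consumes them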
#result_set.py
import copy

class DistIndex:
    def __init__(self, distance, index):
        self.distance = distance
        self.index = index

    def __lt__(self, other):
        return self.distance < other.distance

class KNNResultSet:
    def __init__(self, capacity):
        self.capacity = capacity
        self.count = 0
        self.worst_dist = 1e10
        self.dist_index_list = []
        self.output_index = []
        for i in range(capacity):
            self.dist_index_list.append(DistIndex(self.worst_dist, 0))
        self.comparison_counter = 0

    def size(self):
        return self.count

    def full(self):
        return self.count == self.capacity

    def worstDist(self):
        return self.worst_dist

    def add_point(self, dist, index):
        self.comparison_counter += 1
        if dist > self.worst_dist:
            return

        if self.count < self.capacity:
            self.count += 1

        # insertion sort: shift larger entries right, then slot the new point in
        i = self.count - 1
        while i > 0:
            if self.dist_index_list[i - 1].distance > dist:
                self.dist_index_list[i] = copy.deepcopy(self.dist_index_list[i - 1])
                i -= 1
            else:
                break

        self.dist_index_list[i].distance = dist
        self.dist_index_list[i].index = index
        self.worst_dist = self.dist_index_list[self.capacity - 1].distance

    def __str__(self):
        output = ''
        for i, dist_index in enumerate(self.dist_index_list):
            output += '%d - %.2f\n' % (dist_index.index, dist_index.distance)
        output += 'In total %d comparison operations.' % self.comparison_counter
        return output

    def knn_output_index(self):
        for i, dist_index in enumerate(self.dist_index_list):
            self.output_index.append(dist_index.index)
        return self.output_index

class RadiusNNResultSet:
    def __init__(self, radius):
        self.radius = radius
        self.count = 0
        self.worst_dist = radius
        self.dist_index_list = []
        self.output_index = []
        self.comparison_counter = 0

    def size(self):
        return self.count

    def worstDist(self):
        return self.radius

    def add_point(self, dist, index):
        self.comparison_counter += 1
        if dist > self.radius:
            return

        self.count += 1
        self.dist_index_list.append(DistIndex(dist, index))

    def __str__(self):
        self.dist_index_list.sort()
        output = ''
        for i, dist_index in enumerate(self.dist_index_list):
            output += '%d - %.2f\n' % (dist_index.index, dist_index.distance)
        output += 'In total %d neighbors within %f.\nThere are %d comparison operations.' \
                  % (self.count, self.radius, self.comparison_counter)
        return output

    def radius_nn_output_index(self):
        for i, dist_index in enumerate(self.dist_index_list):
            self.output_index.append(dist_index.index)
        return self.output_index
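To see the two containers in isolation, a tiny hand-fed sketch (the distances and indices are made up):

from result_set import KNNResultSet, RadiusNNResultSet

knn = KNNResultSet(capacity=2)
for dist, idx in [(0.9, 0), (0.3, 1), (0.5, 2)]:
    knn.add_point(dist, idx)
print(knn.knn_output_index())        # [1, 2]: only the two smallest distances survive

rnn = RadiusNNResultSet(radius=0.5)
for dist, idx in [(0.9, 0), (0.3, 1), (0.5, 2)]:
    rnn.add_point(dist, idx)
print(rnn.radius_nn_output_index())  # [1, 2]: only points within the radius are kept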
Optimizing DBSCAN with sklearn's kd-tree
Full code
#DBSCAN_fast.py (sklearn kd-tree version)
# File purpose: DBSCAN clustering accelerated with sklearn's kd-tree
import numpy as np
import matplotlib.pyplot as plt
import time
#from scipy.spatial import KDTree
from sklearn.neighbors import KDTree  # KDTree used for the radius search

plt.style.use('seaborn')

# display a 2D point cloud with matplotlib, colored by cluster index
def Point_Show(point, point_index):
    def colormap(c, num_clusters):
        if c == -1:
            color = [1] * 3  # noise points: white
        else:
            color = [0] * 3  # clusters: one color channel per cluster index
            color[c % 3] = c / num_clusters
        return color
    x = []
    y = []
    num_clusters = max(point_index) + 1
    point = np.asarray(point)
    for i in range(len(point)):
        x.append(point[i][0])
        y.append(point[i][1])
    plt.scatter(x, y, color=[colormap(c, num_clusters) for c in point_index])
    plt.show()

# build the full N*N distance matrix (the part 1 approach, kept for reference)
def my_distance_Marix(data):
    S = np.zeros((len(data), len(data)))  # initialize the N*N distance matrix
    for i in range(len(data)):      # i: row
        for j in range(len(data)):  # j: column
            S[i][j] = np.linalg.norm(data[i] - data[j])  # 2-norm distance between points i and j
    return S

# @profile
def DBSCAN(data, eps, Minpts):
    """
    Density-based point cloud clustering
    :param data: point cloud, NxD
    :param eps: neighborhood search radius
    :param Minpts: minimum number of neighbors required for a core point
    :return: per-point cluster indices; noise points get the index -1
    """
    n = len(data)
    # build the kd-tree
    leaf_size = 4
    tree = KDTree(data, leaf_size)
    # step 1: initialize the core-point set T, cluster index k, label array, and the unvisited set
    T = set()                               # core-point set
    k = 0                                   # current cluster index
    cluster_index = np.zeros(n, dtype=int)  # per-point cluster labels
    unvisited = set(range(n))               # all points start unvisited
    # step 2: find all core points via kd-tree radius NN
    nearest_idx = tree.query_radius(data, eps)  # one batched radius NN query returns every point's neighbors
    for d in range(n):
        if len(nearest_idx[d]) >= Minpts:  # neighbor count >= Minpts: d is a core point
            T.add(d)                       # initial core points
    # step 3: clustering
    while len(T):  # keep going until every core point has been visited
        unvisited_old = unvisited                     # snapshot of the unvisited set
        core = list(T)[np.random.randint(0, len(T))]  # pick a random core point from T
        unvisited = unvisited - set([core])           # mark the core as visited: drop it from unvisited
        visited = []
        visited.append(core)
        while len(visited):
            new_core = visited[0]
            # neighbors were precomputed by the batched radius query above
            if new_core in T:  # only core points spread the cluster
                S = unvisited & set(nearest_idx[new_core])  # the core's still-unvisited neighbors
                visited += (list(S))                        # points reachable from this core still need checking
                unvisited = unvisited - S                   # drop the newly reached points from unvisited
            visited.remove(new_core)                        # new_core has been processed; drop it
        cluster = unvisited_old - unvisited  # everything removed from unvisited this round forms the cluster
        T = T - cluster                      # remove this cluster's core points from T (set difference)
        cluster_index[list(cluster)] = k
        k += 1                               # one more cluster
    noise_cluster = unvisited
    cluster_index[list(noise_cluster)] = -1  # label noise as -1
    print(cluster_index)
    print("Number of clusters generated: %d" % k)
    return cluster_index

# generate synthetic data
def generate_X(true_Mu, true_Var):
    # first cluster
    num1, mu1, var1 = 400, true_Mu[0], true_Var[0]
    X1 = np.random.multivariate_normal(mu1, np.diag(var1), num1)
    # second cluster
    num2, mu2, var2 = 600, true_Mu[1], true_Var[1]
    X2 = np.random.multivariate_normal(mu2, np.diag(var2), num2)
    # third cluster
    num3, mu3, var3 = 1000, true_Mu[2], true_Var[2]
    X3 = np.random.multivariate_normal(mu3, np.diag(var3), num3)
    # stack the clusters
    X = np.vstack((X1, X2, X3))
    # show the data
    plt.figure(figsize=(10, 8))
    plt.axis([-10, 15, -5, 15])
    plt.scatter(X1[:, 0], X1[:, 1], s=5)
    plt.scatter(X2[:, 0], X2[:, 1], s=5)
    plt.scatter(X3[:, 0], X3[:, 1], s=5)
    # plt.show()
    return X

if __name__ == '__main__':
    # generate data
    true_Mu = [[0.5, 0.5], [5.5, 2.5], [1, 7]]
    true_Var = [[1, 3], [2, 2], [6, 2]]
    X = generate_X(true_Mu, true_Var)
    begin_t = time.time()
    cluster_index = DBSCAN(X, eps=0.5, Minpts=15)
    dbscan_time = time.time() - begin_t
    print("dbscan time:%f" % dbscan_time)
    Point_Show(X, cluster_index)
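One further micro-optimization worth noting (a sketch, not benchmarked here): KDTree.query_radius accepts count_only=True, which returns just the neighbor counts, and that is all the core-point test in step 2 needs. Step 3 still requires the index lists, so nearest_idx is kept as-is; `tree`, `data`, `eps`, and `Minpts` are the names from the DBSCAN function above:

counts = tree.query_radius(data, eps, count_only=True)  # neighbor counts only, no index lists
T = set(np.flatnonzero(counts >= Minpts).tolist())      # same core-point set as the loop above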