三维点云学习(4)5-DBSCNA python 复现-2-kd-_tree加速

三维点云学习(4)5-DBSCNA python 复现-2-kd-_tree加速

因为在上一章DBSCAN在构建距离矩阵时,需要构建一个N*N的距离矩阵,严重占用资源,古采用kd_tree搜索进行进一步的优化,使用kd_tree 的radius NN 进行近邻矩阵的构建,大大提高运算速率
DBSCNA python 复现-1- 距离矩阵法

使用自写、scipy库、sklearn库 kd-tree DBSCAN聚类最终效果图

原图:
在这里插入图片描述

自写kd_tree 的radius NN

在这里插入图片描述

生成的聚类个数:3
dbscan time:3.233520

Process finished with exit code 0

使用scipy.spatial 库的kd_tree

其中白色的为噪点
在这里插入图片描述
优化前:

生成的聚类个数:4
dbscan time:19.526319

Process finished with exit code 0

自写-kd_tree优化后,速度提升大概20倍!!!

生成的聚类个数:3
dbscan time:1.043557

Process finished with exit code 0

sklearn-kd_tree优化后,使用sklearn 库更快

生成的聚类个数:5
dbscan time:0.030102

主要优化部分,代码片

构建kd_tree

    n = len(data)
    # 构建kd_tree
    leaf_size = 4
    root = kdtree.kdtree_construction(data,leaf_size=leaf_size)
    #step1 初始化核心对象集合T,聚类个数k,聚类集合C, 未访问集合P
    T = set()    #set 集合
    k = 0        #类初始化
    C = []       #聚类集合
    unvisited = set(range(n))   #初始化未访问集合

step2 通过判断,通过kd_tree radius NN找出所有核心点

    #step2 通过判断,通过kd_tree radius NN找出所有核心点
    for d in range(n):
        result_set = RadiusNNResultSet(radius=eps)  #进行radius NN搜索,半径为epsion
        kdtree.kdtree_radius_search(root,data,result_set,data[d])
        nearest_idx = result_set.radius_nn_output_index()
        if len(nearest_idx) >= Minpts:     #临近点数 > min_sample,加入核心点
            T.add(d)    #最初得核心点

获取new_core的近邻点

            #kd-tree radius NN 搜索邻近
            result_set = RadiusNNResultSet(radius=eps)  # 进行radius NN搜索,半径为epsion
            kdtree.kdtree_radius_search(root, data, result_set, data[new_core])
            new_core_nearest = result_set.radius_nn_output_index()   #获取new_core 得邻近点

完整代码

#DBSCAN_fast.py
# 文件功能:实现 Spectral 谱聚类 算法

from numpy import *
from numpy.random import rand
import matplotlib.pyplot as plt
from result_set import KNNResultSet, RadiusNNResultSet
import  kdtree as kdtree
import  time



plt.style.use('seaborn')


# matplotlib显示点云函数
def Point_Show(point,color):
    x = []
    y = []
    point = np.asarray(point)
    for i in range(len(point)):
        x.append(point[i][0])
        y.append(point[i][1])
    plt.scatter(x, y,color=color)


#构建距离矩阵
def my_distance_Marix(data):
    S = np.zeros((len(data), len(data)))  # 初始化 关系矩阵 w 为 n*n的矩阵
    # step1 建立关系矩阵, 每个节点都有连线,权重为距离的倒数
    for i in range(len(data)):  # i:行
        for j in range(len(data)):  # j:列
                S[i][j] = np.linalg.norm(data[i] - data[j])  # 二范数计算两个点直接的距离,两个点之间的权重为之间距离的倒数
    return S

# @profile
def DBSCAN(data, eps, Minpts):
    """
    基于密度的点云聚类
    :param d_bbox: 点与点之间的距离矩阵
    :param eps:  最大搜索直径阈值
    :param Minpts:  最小包含其他对象数量阈值
    :return: 返回聚类结果,是一个嵌套列表,每个子列表就是这个区域的对象的序号
    """
    n = len(data)
    # 构建kd_tree
    leaf_size = 4
    root = kdtree.kdtree_construction(data,leaf_size=leaf_size)
    #step1 初始化核心对象集合T,聚类个数k,聚类集合C, 未访问集合P
    T = set()    #set 集合
    k = 0        #类初始化
    C = []       #聚类集合
    unvisited = set(range(n))   #初始化未访问集合
    #step2 通过判断,通过kd_tree radius NN找出所有核心点
    for d in range(n):
        result_set = RadiusNNResultSet(radius=eps)  #进行radius NN搜索,半径为epsion
        kdtree.kdtree_radius_search(root,data,result_set,data[d])
        nearest_idx = result_set.radius_nn_output_index()
        if len(nearest_idx) >= Minpts:     #临近点数 > min_sample,加入核心点
            T.add(d)    #最初得核心点
    #step3 聚类
    while len(T):     #visited core ,until all core points were visited
        unvisited_old = unvisited     #更新为访问集合
        core = list(T)[np.random.randint(0,len(T))]    #从 核心点集T 中随机选取一个 核心点core
        unvisited = unvisited - set([core])      #把核心点标记为 visited,从 unvisited 集合中剔除
        visited = []
        visited.append(core)

        while len(visited):
            new_core = visited[0]
            #kd-tree radius NN 搜索邻近
            result_set = RadiusNNResultSet(radius=eps)  # 进行radius NN搜索,半径为epsion
            kdtree.kdtree_radius_search(root, data, result_set, data[new_core])
            new_core_nearest = result_set.radius_nn_output_index()   #获取new_core 得邻近点
            if len(new_core_nearest) >= Minpts:
                S = unvisited & set(new_core_nearest)    #当前 核心对象的nearest 与 unvisited 的交集
                visited +=  (list(S))                     #对该new core所能辐射的点,再做检测
                unvisited = unvisited - S          #unvisited 剔除已 visited 的点
            visited.remove(new_core)                     #new core 已做检测,去掉new core

        k += 1   #类个数
        cluster = unvisited_old -  unvisited    #原有的 unvisited # 和去掉了 该核心对象的密度可达对象的visited就是该类的所有对象
        T = T - cluster  #去掉该类对象里面包含的核心对象
        C.append(cluster)  #把对象加入列表
    print("生成的聚类个数:%d" %k)
    return C,k
# 生成仿真数据
def generate_X(true_Mu, true_Var):
    # 第一簇的数据
    num1, mu1, var1 = 400, true_Mu[0], true_Var[0]
    X1 = np.random.multivariate_normal(mu1, np.diag(var1), num1)
    # 第二簇的数据
    num2, mu2, var2 = 600, true_Mu[1], true_Var[1]
    X2 = np.random.multivariate_normal(mu2, np.diag(var2), num2)
    # 第三簇的数据
    num3, mu3, var3 = 1000, true_Mu[2], true_Var[2]
    X3 = np.random.multivariate_normal(mu3, np.diag(var3), num3)
    # 合并在一起
    X = np.vstack((X1, X2, X3))
    # 显示数据
    plt.figure(figsize=(10, 8))
    plt.axis([-10, 15, -5, 15])
    plt.scatter(X1[:, 0], X1[:, 1], s=5)
    plt.scatter(X2[:, 0], X2[:, 1], s=5)
    plt.scatter(X3[:, 0], X3[:, 1], s=5)
    #plt.show()
    return X


if __name__ == '__main__':
    # 生成数据
    true_Mu = [[0.5, 0.5], [5.5, 2.5], [1, 7]]
    true_Var = [[1, 3], [2, 2], [6, 2]]
    X = generate_X(true_Mu, true_Var)

    begin_t = time.time()
    index,k = DBSCAN(X,eps=0.5,Minpts=15)
    dbscan_time = time.time() - begin_t
    print("dbscan time:%f" %dbscan_time)
    cluster = [[] for i in range(k)]
    for i in range(k):
        cluster[i] = [X[j] for j in index[i]]

    Point_Show(cluster[0],color="red")
    Point_Show(cluster[1], color="orange")
    Point_Show(cluster[2],color="blue")
    plt.show()
#kdtree.py
import random
import math
import numpy as np

from result_set import KNNResultSet, RadiusNNResultSet


class Node:
    def __init__(self, axis, value, left, right, point_indices):
        self.axis = axis
        self.value = value
        self.left = left
        self.right = right
        self.point_indices = point_indices

    def is_leaf(self):
        if self.value is None:
            return True
        else:
            return False

    def __str__(self):
        output = ''
        output += 'axis %d, ' % self.axis
        if self.value is None:
            output += 'split value: leaf, '
        else:
            output += 'split value: %.2f, ' % self.value
        output += 'point_indices: '
        output += str(self.point_indices.tolist())
        return output


def sort_key_by_vale(key, value):
    assert key.shape == value.shape       #assert 断言操作,用于判断一个表达式,在表达式条件为false的时候触发异常
    assert len(key.shape) == 1            #numpy是多维数组
    sorted_idx = np.argsort(value)        #对value值进行排序
    key_sorted = key[sorted_idx]
    value_sorted = value[sorted_idx]      #进行升序排序
    return key_sorted, value_sorted


def axis_round_robin(axis, dim):         #用于轴的轮换
    if axis == dim-1:
        return 0
    else:
        return axis + 1


def kdtree_recursive_build(root, db, point_indices, axis, leaf_size):    #kd树的建立
    """
    :param root:
    :param db: NxD
    :param db_sorted_idx_inv: NxD
    :param point_idx: M
    :param axis: scalar
    :param leaf_size: scalar
    :return:
    """
    if root is None:
        root = Node(axis, None, None, None, point_indices)           #实例化Node

    # determine whether to split into left and right
    if len(point_indices) > leaf_size:                              #判断是否需要进行分割
        # --- get the split position ---
        point_indices_sorted, _ = sort_key_by_vale(point_indices, db[point_indices, axis])  #对点进行排列,dp存储信息
        middle_left_idx = math.ceil(point_indices_sorted.shape[0] / 2) - 1     #分一半
        middle_left_point_idx = point_indices_sorted[middle_left_idx]          #左边界点
        middle_left_point_value = db[middle_left_point_idx, axis]

        middle_right_idx = middle_left_idx + 1
        middle_right_point_idx = point_indices_sorted[middle_right_idx]
        middle_right_point_value = db[middle_right_point_idx, axis]           #右边界点

        root.value = (middle_left_point_value + middle_right_point_value) * 0.5    #取中值为节点值
        # === get the split position ===
        root.left = kdtree_recursive_build(root.left,
                                           db,
                                           point_indices_sorted[0:middle_right_idx],
                                           axis_round_robin(axis, dim=db.shape[1]),
                                           leaf_size)                                  #对对应的轴值进行排序
        root.right = kdtree_recursive_build(root.right,
                                           db,
                                           point_indices_sorted[middle_right_idx:],
                                           axis_round_robin(axis, dim=db.shape[1]),
                                           leaf_size)                                  #对对应的轴值进行排序
    return root


def traverse_kdtree(root: Node, depth, max_depth):      #计算kdtree的深度
    depth[0] += 1
    if max_depth[0] < depth[0]:
        max_depth[0] = depth[0]

    if root.is_leaf():                                 #打印最后的叶子节点
        print(root)
    else:
        traverse_kdtree(root.left, depth, max_depth)    #累加计算深度
        traverse_kdtree(root.right, depth, max_depth)

    depth[0] -= 1


def kdtree_construction(db_np, leaf_size):
    N, dim = db_np.shape[0], db_np.shape[1]

    # build kd_tree recursively
    root = None
    root = kdtree_recursive_build(root,
                                  db_np,
                                  np.arange(N),
                                  axis=0,
                                  leaf_size=leaf_size)
    return root


def kdtree_knn_search(root: Node, db: np.ndarray, result_set: KNNResultSet, query: np.ndarray):   #KNNResultSet 继承二叉树的结果集
    if root is None:
        return False

    if root.is_leaf():
        # compare the contents of a leaf
        leaf_points = db[root.point_indices, :]
        diff = np.linalg.norm(np.expand_dims(query, 0) - leaf_points, axis=1)
        for i in range(diff.shape[0]):
            result_set.add_point(diff[i], root.point_indices[i])
        return False

    if query[root.axis] <= root.value:          #如果 q[axis] inside the partition   如果查询点在根节点的左边,一定要查找左边
        kdtree_knn_search(root.left, db, result_set, query)
        if math.fabs(query[root.axis] - root.value) < result_set.worstDist():   #如果目标点离轴虚线的距离小于worst_dist 继续搜寻节点的右边
            kdtree_knn_search(root.right, db, result_set, query)
    else:
        kdtree_knn_search(root.right, db, result_set, query)
        if math.fabs(query[root.axis] - root.value) < result_set.worstDist():
            kdtree_knn_search(root.left, db, result_set, query)

    return False


def kdtree_radius_search(root: Node, db: np.ndarray, result_set: RadiusNNResultSet, query: np.ndarray):
    if root is None:
        return False

    if root.is_leaf():
        # compare the contents of a leaf
        leaf_points = db[root.point_indices, :]
        diff = np.linalg.norm(np.expand_dims(query, 0) - leaf_points, axis=1)             #取行的差值,暴力搜索
        for i in range(diff.shape[0]):
            result_set.add_point(diff[i], root.point_indices[i])
        return False

    if query[root.axis] <= root.value:
        kdtree_radius_search(root.left, db, result_set, query)
        if math.fabs(query[root.axis] - root.value) < result_set.worstDist():
            kdtree_radius_search(root.right, db, result_set, query)
    else:
        kdtree_radius_search(root.right, db, result_set, query)
        if math.fabs(query[root.axis] - root.value) < result_set.worstDist():
            kdtree_radius_search(root.left, db, result_set, query)

    return False



def main():
    # configuration
    db_size = 64
    dim = 3                #三维
    leaf_size = 4
    k = 1                  #一个点

    db_np = np.random.rand(db_size, dim)

    root = kdtree_construction(db_np, leaf_size=leaf_size)

    depth = [0]
    max_depth = [0]
    traverse_kdtree(root, depth, max_depth)
    print("tree max depth: %d" % max_depth[0])
    query = np.asarray([0, 0, 0])
    result_set = KNNResultSet(capacity=k)
    kdtree_knn_search(root, db_np, result_set, query)

    print(result_set)
    print(db_np)
    #
    # diff = np.linalg.norm(np.expand_dims(query, 0) - db_np, axis=1)
    # nn_idx = np.argsort(diff)
    # nn_dist = diff[nn_idx]
    # print(nn_idx[0:k])
    # print(nn_dist[0:k])
    #
    #
    # print("Radius search:")
    # query = np.asarray([0, 0, 0])
    # result_set = RadiusNNResultSet(radius = 0.5)
    # radius_search(root, db_np, result_set, query)
    # print(result_set)


if __name__ == '__main__':
    main()
#result_set.py
import copy


class DistIndex:
    def __init__(self, distance, index):
        self.distance = distance
        self.index = index

    def __lt__(self, other):
        return self.distance < other.distance


class KNNResultSet:
    def __init__(self, capacity):
        self.capacity = capacity
        self.count = 0
        self.worst_dist = 1e10
        self.dist_index_list = []
        self.output_index = []
        for i in range(capacity):
            self.dist_index_list.append(DistIndex(self.worst_dist, 0))

        self.comparison_counter = 0

    def size(self):
        return self.count

    def full(self):
        return self.count == self.capacity

    def worstDist(self):
        return self.worst_dist

    def add_point(self, dist, index):
        self.comparison_counter += 1
        if dist > self.worst_dist:
            return

        if self.count < self.capacity:
            self.count += 1

        i = self.count - 1
        while i > 0:
            if self.dist_index_list[i - 1].distance > dist:
                self.dist_index_list[i] = copy.deepcopy(self.dist_index_list[i - 1])
                i -= 1
            else:
                break

        self.dist_index_list[i].distance = dist
        self.dist_index_list[i].index = index
        self.worst_dist = self.dist_index_list[self.capacity - 1].distance

    def __str__(self):
        output = ''
        for i, dist_index in enumerate(self.dist_index_list):
            output += '%d - %.2f\n' % (dist_index.index, dist_index.distance)
        output += 'In total %d comparison operations.' % self.comparison_counter
        return output

    def knn_output_index(self):
        output = ''
        for i, dist_index in enumerate(self.dist_index_list):
            output += '%d - %.2f\n' % (dist_index.index, dist_index.distance)
            self.output_index.append(dist_index.index)
        output += 'In total %d comparison operations.' % self.comparison_counter
        return  self.output_index


class RadiusNNResultSet:
    def __init__(self, radius):
        self.radius = radius
        self.count = 0
        self.worst_dist = radius
        self.dist_index_list = []
        self.output_index = []
        self.comparison_counter = 0

    def size(self):
        return self.count

    def worstDist(self):
        return self.radius

    def add_point(self, dist, index):
        self.comparison_counter += 1
        if dist > self.radius:
            return

        self.count += 1
        self.dist_index_list.append(DistIndex(dist, index))

    def __str__(self):
        self.dist_index_list.sort()
        output = ''
        for i, dist_index in enumerate(self.dist_index_list):
            output += '%d - %.2f\n' % (dist_index.index, dist_index.distance)
        output += 'In total %d neighbors within %f.\nThere are %d comparison operations.' \
                  % (self.count, self.radius, self.comparison_counter)
        return output

    def radius_nn_output_index(self):
        output = ''
        for i, dist_index in enumerate(self.dist_index_list):
            output += '%d - %.2f\n' % (dist_index.index, dist_index.distance)
            self.output_index.append(dist_index.index)
        output += 'In total %d comparison operations.' % self.comparison_counter
        return  self.output_index

调用sklearn的kd_tree库进行 DBSCAN的优化

全部代码

#DBCAN_fast.py
# 文件功能:实现 Spectral 谱聚类 算法

import numpy as np
from numpy import *
import scipy
import pylab
import random, math
from numpy.random import rand
from numpy import square, sqrt
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
from scipy.stats import multivariate_normal
from result_set import KNNResultSet, RadiusNNResultSet
from sklearn.cluster import KMeans
import  kdtree as kdtree
import  time
#from scipy.spatial import KDTree
from sklearn.neighbors import KDTree # KDTree 进行搜索
import copy


plt.style.use('seaborn')


# matplotlib显示点云函数
def Point_Show(point,point_index):

    def colormap(c, num_clusters):
        if c == -1:
            color = [1] * 3
        # surrouding object:
        else:
            color = [0] * 3
            color[c % 3] = c / num_clusters
        return color

    x = []
    y = []
    num_clusters = max(point_index) + 1
    point = np.asarray(point)
    for i in range(len(point)):
        x.append(point[i][0])
        y.append(point[i][1])

    plt.scatter(x, y,color=[colormap(c,num_clusters) for c in point_index])
    plt.show()


#构建距离矩阵
def my_distance_Marix(data):
    S = np.zeros((len(data), len(data)))  # 初始化 关系矩阵 w 为 n*n的矩阵
    # step1 建立关系矩阵, 每个节点都有连线,权重为距离的倒数
    for i in range(len(data)):  # i:行
        for j in range(len(data)):  # j:列
                S[i][j] = np.linalg.norm(data[i] - data[j])  # 二范数计算两个点直接的距离,两个点之间的权重为之间距离的倒数
    return S

# @profile
def DBSCAN(data, eps, Minpts):
    """
    基于密度的点云聚类
    :param d_bbox: 点与点之间的距离矩阵
    :param eps:  最大搜索直径阈值
    :param Minpts:  最小包含其他对象数量阈值
    :return: 返回聚类结果,是一个嵌套列表,每个子列表就是这个区域的对象的序号
    """
    n = len(data)
    # 构建kd_tree
    leaf_size = 4
    tree = KDTree(data,leaf_size)
    #step1 初始化核心对象集合T,聚类个数k,聚类集合C, 未访问集合P
    T = set()    #set 集合
    k = 0        #第k类
    cluster_index = np.zeros(n,dtype=int)      #聚类集合
    unvisited = set(range(n))   #初始化未访问集合
    #step2 通过判断,通过kd_tree radius NN找出所有核心点
    nearest_idx = tree.query_radius(data, eps)  # 进行radius NN搜索,半径为epsion,所有点的最临近点储存在 nearest_idx中
    for d in range(n):
        if len(nearest_idx[d]) >= Minpts:     #临近点数 > min_sample,加入核心点
            T.add(d)    #最初得核心点
    #step3 聚类
    while len(T):     #visited core ,until all core points were visited
        unvisited_old = unvisited     #更新为访问集合
        core = list(T)[np.random.randint(0,len(T))]    #从 核心点集T 中随机选取一个 核心点core
        unvisited = unvisited - set([core])      #把核心点标记为 visited,从 unvisited 集合中剔除
        visited = []
        visited.append(core)

        while len(visited):
            new_core = visited[0]
            #kd-tree radius NN 搜索邻近
            if new_core in T:     #如果当前搜索点是核心点
                S = unvisited & set(nearest_idx[new_core])    #当前 核心对象的nearest 与 unvisited 的交集
                visited +=  (list(S))                     #对该new core所能辐射的点,再做检测
                unvisited = unvisited - S          #unvisited 剔除已 visited 的点
            visited.remove(new_core)                     #new core 已做检测,去掉new core

        cluster = unvisited_old -  unvisited    #原有的 unvisited # 和去掉了 该核心对象的密度可达对象的visited就是该类的所有对象
        T = T - cluster  #去掉该类对象里面包含的核心对象,差集
        cluster_index[list(cluster)] = k
        k += 1   #类个数
    noise_cluster = unvisited
    cluster_index[list(noise_cluster)] = -1    #噪声归类为 1
    print(cluster_index)
    print("生成的聚类个数:%d" %k)
    return cluster_index
# 生成仿真数据
def generate_X(true_Mu, true_Var):
    # 第一簇的数据
    num1, mu1, var1 = 400, true_Mu[0], true_Var[0]
    X1 = np.random.multivariate_normal(mu1, np.diag(var1), num1)
    # 第二簇的数据
    num2, mu2, var2 = 600, true_Mu[1], true_Var[1]
    X2 = np.random.multivariate_normal(mu2, np.diag(var2), num2)
    # 第三簇的数据
    num3, mu3, var3 = 1000, true_Mu[2], true_Var[2]
    X3 = np.random.multivariate_normal(mu3, np.diag(var3), num3)
    # 合并在一起
    X = np.vstack((X1, X2, X3))
    # 显示数据
    plt.figure(figsize=(10, 8))
    plt.axis([-10, 15, -5, 15])
    plt.scatter(X1[:, 0], X1[:, 1], s=5)
    plt.scatter(X2[:, 0], X2[:, 1], s=5)
    plt.scatter(X3[:, 0], X3[:, 1], s=5)
    #plt.show()
    return X


if __name__ == '__main__':
    # 生成数据
    true_Mu = [[0.5, 0.5], [5.5, 2.5], [1, 7]]
    true_Var = [[1, 3], [2, 2], [6, 2]]
    X = generate_X(true_Mu, true_Var)

    cluster_index = DBSCAN(X,eps=0.5,Minpts=15)
    Point_Show(X,cluster_index)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值