dbscan模板类的c++实现

dbscan模板类的c++实现

好久没更新了,写个聚类凑凑数=.=

  • 写了个dbscan的c++模板类,可以用来处理简单的聚类,耗时也很少
  • https://zh.wikipedia.org/wiki/DBSCAN中的算法实现,原理很简单,就不重复了
  • 使用了距离矩阵存储所有点的距离,减少重复的距离计算
  • 可选:添加OpenMP并行运算计算距离,但实测好像运行耗时没怎么减少,可能测试用例太简单
  • 其中主要的耗时应该还是距离矩阵的计算,有两重for循环增加耗时。如果是数据量比较大的话,可以使用kdtree等结构来优化耗时,kdtree查询最近点更方便,这样聚类时判断邻点是否满足要求时会更快一些
  • 可以找一点大一点的聚类数据来做聚类
  • 直接上代码了,后面就不介绍了,有注释应该好看
  • 项目代码 https://gitee.com/leox24/dbscan_template
  • 参考代码 https://github.com/buresu/DBSCAN

dbscan.h

/**
 * @file dbscan.h
 * @author tianchangxin
 * @brief Implementation of DBSCAN algorithm from https://zh.wikipedia.org/wiki/DBSCAN
 */
#pragma once
#include <Eigen/Core>
#include <Eigen/Dense>
#include <algorithm>
#include <deque>
#include <iostream>
#include <vector>

using std::deque;
using std::vector;

struct Point2d
{
    float x = 0.0f;
    float y = 0.0f;
};

// namespace TBD
// {

/**
 * @brief Density-based spatial clustering of applications with noise.
 * implementation of DBSCAN algorithm from https://zh.wikipedia.org/wiki/DBSCAN.
 * using distance matrix to reduce repeated computation of distance, which would increase space complexity o(n^2).
 */
template <class T> class DBScan
{
  private:
    // minimum number of points in a cluster
    int _min_num;

    // maximum distance threshold of two points
    double _eps;

    // cluster attributes of each point
    vector<int> _labels;

    // save the distance between all points
    Eigen::MatrixXd _distance_mat;

    // TODO: you should define the distance function of your struct/class
    /**
     * @brief Get the Distance object
     * @param p1 object/point 1
     * @param p2 object/point 2
     * @return double 
     */
    inline double GetDistance(const Eigen::Vector2d& p1, const Eigen::Vector2d& p2)
    {
        return (p1 -p2).norm();
    }
    inline double GetDistance(const Point2d& p1, const Point2d& p2)
    {
        return sqrt(std::pow((p1.x - p2.x), 2) + pow((p1.y - p2.y), 2));
    }

    /**
     * @brief core function of dbscan
     * @param all_pts, all points/objects
     * @return int, number of clusters
     */
    int Run(const vector<T> &all_pts);

    /**
     * @brief find neighbors within a fixed distance, and generate a cluster
     * @param pt_idx, point index of all points
     * @param cluster_idx, cluster index
     * @return bool, true:cluster is generated, false:point/object is noise
     */
    bool ExpandCluster(int pt_idx, int &cluster_idx);

  public:
    /**
     * @brief Construct a new DBScan object
     * @param eps, define maximum distance threshold of two points
     * @param min_num, define minimum number of points in a cluster
     */
    DBScan(float eps, int min_num) : _eps(eps), _min_num(min_num){}

    /**
     * @brief Destroy the DBScan object
     */
    ~DBScan(){}

    /**
     * @brief Get the Clusters object
     * @param all_pts, all points/objects
     * @return vector<vector<T>>, 0:noise points, 1-n:cluster points
     */
    vector<vector<T>> GetClusters(std::vector<T> all_pts);

    /**
     * @brief Get the Labels of object
     * @return vector<int>, get the cluster label corresponds to the index of each object
     */
    vector<int> GetLabels(){ return _labels;}
};

template <typename T> int DBScan<T>::Run(const vector<T> &all_pts)
{
    int size = all_pts.size();
    int cluster_idx = 1;

    // -1:noise, 0:unlabel, >1:cluster index
    _labels.clear();
    _labels.resize(size, 0);

    // 1. calculate the distance between each two points, opt(using OpenMP if data is big)
    _distance_mat = Eigen::MatrixXd::Zero(size, size);
// #pragma omp parallel for schedule(runtime)
    for (int i = 0; i < size; ++i)
    {
        for (int j = i; j < size; ++j)
        {
            if (i != j)
            {
                _distance_mat(i, j) = GetDistance(all_pts.data()[i], all_pts.data()[j]);
                _distance_mat(j, i) = _distance_mat(i, j);
            }
        }
    }
    // std::cout << _distance_mat << std::endl;

    // 2. do clustering
    for (size_t i = 0; i < size; i++)
    {
        if (_labels[i] != 0)
            continue;
        ExpandCluster(i, cluster_idx);
    }

    return cluster_idx - 1;
}

template <typename T> bool DBScan<T>::ExpandCluster(int pt_idx, int &cluster_idx)
{
    // 1.region query
    deque<int> seeds_idx;
    seeds_idx.emplace_back(pt_idx);
    for (size_t col = 0; col < _distance_mat.cols(); col++)
    {
    	if (col == pt_idx) continue;
        if (_distance_mat(pt_idx, col) < _eps)
        {
            seeds_idx.emplace_back(col);
        }
    }

    // 2.check point numbers of neighbors, whether its noise
    if (seeds_idx.size() < _min_num)
    {
        _labels[pt_idx] = -1;
        return false;
    }
    // 3.label point
    for (size_t i = 0; i < seeds_idx.size(); i++)
    {
        _labels[seeds_idx[i]] = cluster_idx;
    }

    // 4.do neighbors clustering
    seeds_idx.pop_front();
    while (!seeds_idx.empty())
    {
        auto &row = seeds_idx.front();
        // region query
        vector<int> temp_idx;
        temp_idx.reserve(_distance_mat.cols());
        for (size_t col = 0; col < _distance_mat.cols(); col++)
        {
            if (_distance_mat(row, col) <= _eps)
                temp_idx.emplace_back(col);
        }
        // check point numbers of neighbors, whether its noise
        if (temp_idx.size() > _min_num)
        {
            // label point
            for (size_t i = 0; i < temp_idx.size(); i++)
            {
                // if already label, abort
                if (_labels[temp_idx[i]] >= 1)
                    continue;
                if (_labels[temp_idx[i]] == 0)
                    seeds_idx.emplace_back(temp_idx[i]);
                _labels[temp_idx[i]] = cluster_idx;
            }
        }

        seeds_idx.pop_front();
    }

    cluster_idx++;
    return true;
}
template <typename T> vector<vector<T>> DBScan<T>::GetClusters(std::vector<T> all_pts)
{
    int clusters_num = this->Run(all_pts);
    vector<vector<T>> result(clusters_num + 1);

    for (size_t i = 0; i < _labels.size(); i++)
    {
        auto &label = _labels[i];
        if (label < 1)
        {
            // noise point, index=0
            result[label + 1].emplace_back(all_pts[i]);
            continue;
        }

        result[label].emplace_back(all_pts[i]);
    }

    return result;
}
// } // namespace TBD

main.cc

#include <iostream>
#include <chrono>
#include <gtest/gtest.h>
#include "dbscan.h"
using namespace std;

TEST(testcase, test_dbscan)
{
    vector<Eigen::Vector2d> points(10);
    vector<vector<Eigen::Vector2d> > result;

    points[0].x() = 20; points[0].y() = 21;
    points[1].x() = 20; points[1].y() = 25;
    points[2].x() = 28; points[2].y() = 22;
    points[3].x() = 30; points[3].y() = 52;
    points[4].x() = 26; points[4].y() = 70;
    points[5].x() = 30; points[5].y() = 75;
    points[6].x() = 0;  points[6].y() = 70;
    points[7].x() = 70; points[7].y() = 50;
    points[8].x() = 67; points[8].y() = 69;
    points[9].x() = 80; points[9].y() = 35;

    result.resize(4);
    result[0].emplace_back(points[6]);
    result[1].emplace_back(points[0]);
    result[1].emplace_back(points[1]);
    result[1].emplace_back(points[2]);
    result[2].emplace_back(points[3]);
    result[2].emplace_back(points[4]);
    result[2].emplace_back(points[5]);
    result[3].emplace_back(points[7]);
    result[3].emplace_back(points[8]);
    result[3].emplace_back(points[9]);

    DBScan<Eigen::Vector2d> dbscan(20.0, 3);
    vector<vector<Eigen::Vector2d>> res = dbscan.GetClusters(points);

    cout << "noise " << endl;
    for(auto& r : res[0])
        cout << r.transpose() << endl;
    for (size_t i = 1; i < res.size(); i++)
    {
        cout << "cluster " << i << endl;
        for (size_t j = 0; j < res[i].size(); j++)
        {
            cout << res[i][j].transpose() << endl;
        }

    }

    EXPECT_EQ(result, res);
}

int main(int argc, char** argv) {

    testing::InitGoogleTest(&argc, argv);
    return RUN_ALL_TESTS();
    return 0;
}

CMakeLists.txt

cmake_minimum_required(VERSION 2.8)
project(dbscan)
set(CMAKE_BUILD_TYPE RELEASE)
add_compile_options(-std=c++11)
set( CMAKE_CXX_FLAGS "-std=c++11 -O2")

find_package(OpenMP REQUIRED)

include_directories("./src")
include(GoogleTest)

file(GLOB_RECURSE PROJECT_HEADERS "src/*.h" "src/*.hpp")
file(GLOB_RECURSE PROJECT_SOURCES "src/*.cc" "src/*.cpp")

add_executable(${PROJECT_NAME} ${PROJECT_HEADERS} ${PROJECT_SOURCES})
target_link_libraries(${PROJECT_NAME} OpenMP::OpenMP_CXX gtest pthread)


  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
DBSCAN算法的C++实现需要以下步骤: 1. 定义一个Point类,用于存储每个点的坐标和聚类信息。 2. 定义一个DBSCAN类,包含以下成员函数: - init函数:初始化N个坐标点。 - regionQuery函数:计算每个点的邻域内的点。 - expandCluster函数:扩展聚类。 - dbscan函数:执行DBSCAN算法。 3. 在main函数中,创建一个DBSCAN对象并调用dbscan函数进行聚类。 下面是一个简单的DBSCAN算法C++实现的代码示例: ``` #include <iostream> #include <cstdlib> #include <ctime> #include <vector> #include <cmath> using namespace std; const int N = 100; // 坐标点数量 const double eps = 1.0; // 邻域半径 const int minPts = 5; // 最小邻域点数 class Point { public: double x, y; int cluster; }; class DBSCAN { public: Point point[N]; vector<int> neighborPts[N]; void init(int n) { srand((unsigned)time(NULL)); for(int i=0; i<n; i++) { point[i].x = rand() % (N+1); point[i].y = rand() % (N+1); point[i].cluster = -1; } } void regionQuery(int p) { neighborPts[p].clear(); for(int i=0; i<N; i++) { if(i == p) continue; double dist = sqrt(pow(point[i].x - point[p].x,2) + pow(point[i].y - point[p].y, 2)); if(dist <= eps) { neighborPts[p].push_back(i); } } } void expandCluster(int p, int c) { point[p].cluster = c; for(int i=0; i<neighborPts[p].size(); i++) { int np = neighborPts[p][i]; if(point[np].cluster == -1) { point[np].cluster = c; regionQuery(np); if(neighborPts[np].size() >= minPts) { expandCluster(np, c); } } } } void dbscan() { int c = 0; for(int i=0; i<N; i++) { if(point[i].cluster != -1) continue; regionQuery(i); if(neighborPts[i].size() < minPts) { point[i].cluster = 0; // 噪声点 } else { c++; expandCluster(i, c); } } } }; int main() { DBSCAN dbscan; dbscan.init(N); dbscan.dbscan(); for(int i=0; i<N; i++) { cout << "Point " << i << ": (" << dbscan.point[i].x << ", " << dbscan.point[i].y << ") Cluster: " << dbscan.point[i].cluster << endl; } return 0; } ```
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值