聚类算法c++代码实现(DBSCAN)

本文详细介绍了DBSCAN聚类算法的C++实现,包括代码优化,支持处理多维数据,以及边界点、核心点和噪音点的准确分类。通过读取xx.txt文件,输出clustering.txt结果文件,展示了算法在不同维度数据上的应用,如dataPoint2.txt和dataPoint4.txt的测试数据。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

代码原始出处:http://blog.csdn.net/k76853/article/details/50440182 zhouxianen,20170322
修改:

  • 能准确分类边界点、核心点和噪音点,并输出
  • 维度扩展,原来程序只能读取、聚类和显示二维特征数据,扩展后,可以处理n维数据
  • 定义了init()函数初始化类成员变量(也可以不用)
  • 邻域半径eps判断条件修改为小于等于,这是为保证与matlab版本形式一致

输入数据类型:xx.txt文件

输入txt文件格式如下(数据维度可以是n维):
1.400000,0.200000
1.400000,0.200000
1.300000,0.200000

输出结果:控制台输出,同时输出clustering.txt文件 输出txt文件格式如下:
1.4,0.2,1
1.4,0.2,1
1.3,0.2,1 其中第三列为分类标记,数据之间用逗号分开 本文件中附上了两组测试数据和测试结果(测试数据来源:http://yarpiz.com/255/ypml110-dbscan-clustering)
二维测试数据:dataPoint2.txt 四维测试数据:dataPoint4.txt
输出结果:clustering2.txt,clustering4.txt (eps=0.25,MinPts=3时结果)

#include "DBSCAN.h"

int main(int argc, char** argv) {
	vector<point> dataset = openFile("788points.txt");
	DBSCAN(dataset, 1.2,4);
	//cout << pointType_UNDO << pointType_NOISE
	//	<<pointType_BORDER
	//	<<pointType_CORE << endl;

	return 0;
}
//#if !defined(_DBSCAN_H_INCLUDED_)
//#define _DBSCAN_H_INCLUDED_

#include <iostream>
#include <sstream>
#include <fstream>
#include <vector>
#include <ctime>
#include <cstdlib>
#include <limits>
#include <cmath>
#include <stack>
#include <map>

using namespace std;

// Classification of a point. The enumerators take the default values 0..3.
enum
{
	pointType_UNDO,   // not classified yet
	pointType_NOISE,  // noise point
	pointType_BORDER, // border point
	pointType_CORE    // core point
};

// One sample of the data set plus the bookkeeping used while clustering.
//
// Every member has a default initialiser, so a freshly constructed point
// never exposes indeterminate values; init() restores the clustering state.
class point {
public:
	float x = 0.0f;
	float y = 0.0f;                 // legacy 2-D coordinates; the vector constructor does not fill them
	vector<float> xn;               // coordinates, one entry per dimension
	int cluster = 0;                // cluster label
	int pointType = pointType_UNDO; // 1 noise, 2 border, 3 core (see the enum above)
	int pts = 0;                    // number of points found around this point
	int corePointID = -1;           // label of the core point, -1 when unset
	vector<int> corepts;            // core points linked to this one
	int visited = 0;                // 1 once the point has been traversed, 0 otherwise
	void init();                    // reset the clustering state

	// Builds an n-dimensional point.
	//   an  coordinates, copied into xn
	//   c   initial cluster label
	point(vector<float> an, int c) : xn(an), cluster(c) {}
};


float stringToFloat(string i);                             // convert a string to a floating-point number
vector<point> openFile(const char* dataset);               // load the points from a text file
float squareDistance(point a, point b);                    // Euclidean distance on the 2-D x/y members
float squareDistanceVect(point a, point b);                // Euclidean distance on the n-dimensional xn members
void DBSCAN(vector<point> dataset, float Eps, int MinPts); // cluster the points and report the result


//#endif

#include "DBSCAN.h"
//#include "DBSCAN.h"

// Last cluster label handed out. DBSCAN increments it each time it opens a
// new cluster; no reset is visible in this file, so labels presumably keep
// counting up across successive DBSCAN calls.
int clusterID = 0;

void point::init()
{
	cluster = 0;
	pointType = pointType_UNDO;//pointType_NOISE pointType_UNDO
	pts = 0;
	visited = 0;
	corePointID = -1;
}


// Parses the number at the start of `i` and returns it as a float.
// Text after the number is ignored; 0 is returned when nothing can be parsed.
float stringToFloat(string i) {
	float parsed = 0;
	stringstream converter;
	converter << i;       // load the text into the stream
	converter >> parsed;  // extract the leading number
	return parsed;
}

// Loads points from a text file.
//
// The file is read as whitespace-separated records. Each record is a list of
// comma-separated coordinates, e.g. "1.4,0.2" or "5.1,3.5,1.4,0.2". A record
// without any comma is skipped. Every accepted record becomes one point whose
// initial cluster label is its 1-based sequence number.
//
//   dataset  path of the file to read
//   returns  the points in file order; empty when the file cannot be opened
//            (after printing "Open File Failed!")
vector<point> openFile(const char* dataset) {
	ifstream file(dataset);
	if (!file)
	{
		cout << "Open File Failed!" << endl;
		return vector<point>();
	}

	vector<point> data;
	int nextLabel = 1;
	string record;
	// Stop on the first failed extraction, whatever the reason: testing only
	// eof() would spin forever on a stream that failed without reaching EOF.
	while (file >> record) {
		if (record.find(',') == string::npos)
			continue;  // fewer than two coordinates: not a data record

		vector<float> coords;
		string::size_type fieldStart = 0;
		while (true) {
			const string::size_type comma = record.find(',', fieldStart);
			// substr takes a length, not an end index; npos means "to the end".
			const string::size_type fieldLen =
				(comma == string::npos) ? string::npos : comma - fieldStart;
			coords.push_back(stringToFloat(record.substr(fieldStart, fieldLen)));
			if (comma == string::npos)
				break;  // that was the last field of the record
			fieldStart = comma + 1;
		}
		data.push_back(point(coords, nextLabel++));
	}

	cout << "successful!" << endl;
	return data;
}

// Euclidean distance between two points using their 2-D x/y members.
// Despite the name, the value returned is the distance itself, not its square.
float squareDistance(point a, point b) {
	const float dx = a.x - b.x;
	const float dy = a.y - b.y;
	return sqrt(dx * dx + dy * dy);
}

// Euclidean distance between two points using their n-dimensional xn members.
// Despite the name, the value returned is the distance itself, not its square.
//
// When the two points have a different number of coordinates only the
// dimensions present in both are compared, so no coordinate is read out of
// bounds.
float squareDistanceVect(point a, point b) {
	// Bind by reference: the coordinate vectors do not need to be copied again.
	const vector<float>& lhs = a.xn;
	const vector<float>& rhs = b.xn;
	const size_t dims = lhs.size() < rhs.size() ? lhs.size() : rhs.size();

	float sumOfSquares = 0;
	for (size_t k = 0; k < dims; ++k) {
		const float diff = lhs[k] - rhs[k];
		sumOfSquares += diff * diff;
	}
	return sqrt(sumOfSquares);
}
void DBSCAN(vector<point> dataset, float Eps, int MinPts) {
	int len = dataset.size();//有多少个点
	//cout << len << endl;
	for (int i = 0; i < len; i++)//参数初始化
	{
		dataset[i].init();
	}
	vector<vector <float>> distP2P(len);
	//vector<vector <float>> distP2P(vector <float>(len));

	//calculate pts计算每个点在eps范围内有几个点
	cout << "calculate pts" << endl;
	for (int i = 0; i < len; i++) {
		for (int j = 0; j < len; j++) {
			float distance = squareDistanceVect(dataset[i], dataset[j]);//squareDistanceVect squareDistance 
			distP2P[i].push_back(distance);//disp for debug
			if (distance <= Eps) {
				dataset[i].pts++;
			}
		}
	}
	//core point 核心点,pts大于minPts的时候,该点为核心点
	cout << "core point " << endl;
	vector<point> corePoint;
	for (int i = 0; i < len; i++) {
		int tempPts = dataset[i].pts;
		if (tempPts >= MinPts) {
			dataset[i].pointType = pointType_CORE;
			dataset[i].corePointID = i;
			corePoint.push_back(dataset[i]);
		}
	}

	cout << "joint core point" << endl;
	//joint core point
	int numCorePoint = corePoint.size(); //core point number核心点的数量
	for (int i = 0; i < numCorePoint; i++) {
		for (int j = 0; j < numCorePoint; j++) {
			float distTemp = distP2P[corePoint[i].corePointID][corePoint[j].corePointID];//display for debug  distTemp相当于二维数组,distTemp[i][j]即核心点i和j之间的距离
			if (distTemp <= Eps) {//squareDistance(corePoint[i],corePoint[j])
				corePoint[i].corepts.push_back(j);//other point orderID link to core point
			}//把每一个在核心点领域的核心点放到一起
		}
	}
	for (int i = 0; i < numCorePoint; i++) {//遍历所有的核心点
		stack<point*> ps;//临时存储核心点
		if (corePoint[i].visited == 1) continue;
		clusterID++;
		corePoint[i].cluster = clusterID;//create a new cluster
		ps.push(&corePoint[i]);
		point* v;
		while (!ps.empty()) {
			v = ps.top();
			v->visited = 1;
			ps.pop();
			for (int j = 0; j < v->corepts.size(); j++) {//最开始归类的一簇进行遍历
				if (corePoint[v->corepts[j]].visited == 1) continue;
				corePoint[v->corepts[j]].cluster = corePoint[i].cluster;
				//dataset[v->corepts[j]].cluster= corePoint[i].cluster;
				corePoint[v->corepts[j]].visited = 1;
				ps.push(&corePoint[v->corepts[j]]);
			}
		}
	}

	cout << "border point,joint border point to core point" << endl;
	//border point,joint border point to core point
	int k = 0;//k用来在dataset中统计是第几个核心点
	for (int i = 0; i < len; i++) {
		if (dataset[i].pointType == pointType_CORE)//如果该点是核心点,在上面已经访问过了,就不再访问,因为核心点不可能是边界点,没必要再访问一次
		{
			dataset[i].cluster = corePoint[k++].cluster;//遍历到第k个核心点时,把属于的簇id给原来的dataset
			continue;
		}
		for (int j = 0; j < numCorePoint; j++) {
			float distTemp = distP2P[i][corePoint[j].corePointID];
			if (distTemp <= Eps) {
				dataset[i].pointType = pointType_BORDER;
				dataset[i].cluster = corePoint[j].cluster;
				break;
			}
		}
	}
	cout << "output" << endl;
	//output
	//display  
	//save in .txt format named clustering.txt
	fstream clustering;//save .txt
	clustering.open("clustering.txt", ios::out);//save .txt
	char dispInfo[500];
	int dataDim = dataset[0].xn.size();//data dimension
	for (int i = 0; i < len; i++) {
		//%11.4lf,%11.4lf,%11.4lf,%11.4lf
		sprintf(dispInfo, "第%3d个数据:", i + 1);
		for (int j = 0; j < dataDim; j++) {
			char dataSrc[30];
			if (j == 0)
				sprintf(dataSrc, "%11.4lf", dataset[i].xn[j]);
			else
				sprintf(dataSrc, ",%11.4lf", dataset[i].xn[j]);
			strcat(dispInfo, dataSrc);
		}
		char dataClust[30];
		sprintf(dataClust, ",%4d", dataset[i].cluster);
		strcat(dispInfo, dataClust);
		char datasrc1[10];
		sprintf(datasrc1, "    %d\n", dataset[i].pointType);
		strcat(dispInfo, datasrc1);
		cout << dispInfo;      //display in cmd window
		clustering << dispInfo;//save the results in .txt format named clustering.txt

	}
	clustering.close();//save .txt

}
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值