代码原始出处:http://blog.csdn.net/k76853/article/details/50440182 zhouxianen,20170322
修改:
- 能准确分类边界点、核心点和噪音点,并输出
- 维度扩展,原来程序只能读取、聚类和显示二维特征数据,扩展后,可以处理n维数据
- 定义了init()函数初始化类成员变量(也可以不用)
- 邻域半径eps判断条件修改为小于等于,这是为保证与matlab版本形式一致
输入数据类型:xx.txt文件
输入txt文件格式如下(数据维度可以是n维):
1.400000,0.200000
1.400000,0.200000
1.300000,0.200000
输出结果:控制台输出,同时输出clustering.txt文件
输出txt文件格式如下:
1.4,0.2,1
1.4,0.2,1
1.3,0.2,1
其中第三列为分类标记,数据之间用逗号分开
本文件中附上了两组测试数据和测试结果(测试数据来源:http://yarpiz.com/255/ypml110-dbscan-clustering)
二维测试数据:dataPoint2.txt 四维测试数据:dataPoint4.txt
输出结果:clustering2.txt,clustering4.txt (eps=0.25,MinPts=3时结果)
#include "DBSCAN.h"
int main(int argc, char** argv) {
vector<point> dataset = openFile("788points.txt");
DBSCAN(dataset, 1.2,4);
//cout << pointType_UNDO << pointType_NOISE
// <<pointType_BORDER
// <<pointType_CORE << endl;
return 0;
}
//#if !defined(_DBSCAN_H_INCLUDED_)
//#define _DBSCAN_H_INCLUDED_
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <limits>
#include <map>
#include <sstream>
#include <stack>
#include <string>
#include <vector>
using namespace std;
// Classification state of a point. The enumerators carry the values 0..3.
enum
{
    pointType_UNDO = 0,    // not classified yet
    pointType_NOISE = 1,   // noise
    pointType_BORDER = 2,  // border point
    pointType_CORE = 3     // core point
};
// One n-dimensional sample plus the bookkeeping fields used while clustering.
//
// Every scalar member has a default value so that a freshly constructed point
// can be copied and inspected safely even before init() is called.
class point {
public:
    float x = 0.0f;        // 2-D x coordinate; read by squareDistance(), not set by the constructor
    float y = 0.0f;        // 2-D y coordinate; read by squareDistance(), not set by the constructor
    vector<float> xn;      // coordinates of the point (any number of dimensions)
    int cluster = 0;       // id of the cluster the point belongs to
    int pointType = 0;     // 0 undone, 1 noise, 2 border, 3 core (the pointType_* values)
    int pts = 0;           // number of points found within the neighbourhood radius
    int corePointID = -1;  // index assigned when the point is recorded as a core point; -1 otherwise
    vector<int> corepts;   // indices of core points linked to this core point
    int visited = 0;       // 1 once the point has been traversed, 0 before

    // Resets the clustering bookkeeping fields (defined out of line).
    void init();

    // Builds a point from its coordinate vector.
    // an: coordinates, one entry per dimension.
    // c:  initial value stored in `cluster`.
    point(vector<float> an, int c) : xn(an), cluster(c) {}
};
// Converts the leading number of a string to float.
float stringToFloat(string i);
// Reads the points stored in the given text file.
vector<point> openFile(const char* dataset);
// Euclidean distance computed from the x / y members (despite the name, not squared).
float squareDistance(point a, point b);
// Euclidean distance computed from the xn coordinate vectors (any dimension).
float squareDistanceVect(point a, point b);
// Clusters the data set with neighbourhood radius Eps and density threshold MinPts.
void DBSCAN(vector<point> dataset, float Eps, int MinPts);
//#endif
#include "DBSCAN.h"
//#include "DBSCAN.h"
// Running cluster counter; DBSCAN() increments it each time it starts a new cluster.
int clusterID = 0;
void point::init()
{
cluster = 0;
pointType = pointType_UNDO;//pointType_NOISE pointType_UNDO
pts = 0;
visited = 0;
corePointID = -1;
}
// Converts the text in `i` to a float by stream extraction.
// Parsing stops at the first character that cannot belong to the number;
// the result is 0 when no number can be read at all.
float stringToFloat(string i) {
    istringstream reader(i);
    float parsed = 0;
    reader >> parsed;
    return parsed;
}
// Reads comma-separated points from a text file.
//
// The file is consumed as whitespace-delimited tokens. Every token containing
// at least one comma becomes one point whose coordinates are its
// comma-separated fields, each converted with stringToFloat(). Tokens without
// a comma are skipped. Points are numbered 1, 2, 3, ... in file order and that
// number is passed to the point constructor as the initial cluster value.
//
// dataset: path of the file to read.
// Returns the points read. If the file cannot be opened, prints
// "Open File Failed!" and returns an empty vector.
vector<point> openFile(const char* dataset) {
    ifstream file(dataset);
    if (!file) {
        cout << "Open File Failed!" << endl;
        return vector<point>();
    }

    vector<point> data;
    int nextId = 1;
    string token;
    // Looping on the extraction itself stops at end-of-file and also on a read
    // error, so a stream that fails without reaching EOF cannot spin forever.
    while (file >> token) {
        if (token.find(',') == string::npos) {
            continue;  // no separator: not a multi-dimensional row
        }

        vector<float> coords;
        string::size_type fieldStart = 0;
        while (true) {
            const string::size_type comma = token.find(',', fieldStart);
            const bool lastField = (comma == string::npos);
            // substr() takes a length, not an end position.
            const string::size_type fieldLen =
                lastField ? string::npos : comma - fieldStart;
            coords.push_back(stringToFloat(token.substr(fieldStart, fieldLen)));
            if (lastField) {
                break;
            }
            fieldStart = comma + 1;
        }

        data.push_back(point(coords, nextId++));
    }

    file.close();
    cout << "successful!" << endl;
    return data;
}
// Euclidean distance between two points, computed from their x / y members.
// Despite the name, the value returned is the distance itself, not its square.
float squareDistance(point a, point b) {
    const float dx = a.x - b.x;
    const float dy = a.y - b.y;
    return sqrt(dx * dx + dy * dy);
}
// Euclidean distance between two points, computed from their xn coordinate
// vectors (any number of dimensions). Despite the name, the value returned is
// the distance itself, not its square.
//
// If the two points carry a different number of coordinates, only the leading
// coordinates present in both are compared, so neither vector is ever indexed
// past its end.
float squareDistanceVect(point a, point b) {
    // Bind by reference: the arguments are already copies, no need to copy again.
    const vector<float>& lhs = a.xn;
    const vector<float>& rhs = b.xn;
    const vector<float>::size_type dims =
        lhs.size() < rhs.size() ? lhs.size() : rhs.size();

    float sumOfSquares = 0;
    for (vector<float>::size_type k = 0; k < dims; ++k) {
        const float diff = lhs[k] - rhs[k];
        sumOfSquares += diff * diff;
    }
    return sqrt(sumOfSquares);
}
void DBSCAN(vector<point> dataset, float Eps, int MinPts) {
int len = dataset.size();//有多少个点
//cout << len << endl;
for (int i = 0; i < len; i++)//参数初始化
{
dataset[i].init();
}
vector<vector <float>> distP2P(len);
//vector<vector <float>> distP2P(vector <float>(len));
//calculate pts计算每个点在eps范围内有几个点
cout << "calculate pts" << endl;
for (int i = 0; i < len; i++) {
for (int j = 0; j < len; j++) {
float distance = squareDistanceVect(dataset[i], dataset[j]);//squareDistanceVect squareDistance
distP2P[i].push_back(distance);//disp for debug
if (distance <= Eps) {
dataset[i].pts++;
}
}
}
//core point 核心点,pts大于minPts的时候,该点为核心点
cout << "core point " << endl;
vector<point> corePoint;
for (int i = 0; i < len; i++) {
int tempPts = dataset[i].pts;
if (tempPts >= MinPts) {
dataset[i].pointType = pointType_CORE;
dataset[i].corePointID = i;
corePoint.push_back(dataset[i]);
}
}
cout << "joint core point" << endl;
//joint core point
int numCorePoint = corePoint.size(); //core point number核心点的数量
for (int i = 0; i < numCorePoint; i++) {
for (int j = 0; j < numCorePoint; j++) {
float distTemp = distP2P[corePoint[i].corePointID][corePoint[j].corePointID];//display for debug distTemp相当于二维数组,distTemp[i][j]即核心点i和j之间的距离
if (distTemp <= Eps) {//squareDistance(corePoint[i],corePoint[j])
corePoint[i].corepts.push_back(j);//other point orderID link to core point
}//把每一个在核心点领域的核心点放到一起
}
}
for (int i = 0; i < numCorePoint; i++) {//遍历所有的核心点
stack<point*> ps;//临时存储核心点
if (corePoint[i].visited == 1) continue;
clusterID++;
corePoint[i].cluster = clusterID;//create a new cluster
ps.push(&corePoint[i]);
point* v;
while (!ps.empty()) {
v = ps.top();
v->visited = 1;
ps.pop();
for (int j = 0; j < v->corepts.size(); j++) {//最开始归类的一簇进行遍历
if (corePoint[v->corepts[j]].visited == 1) continue;
corePoint[v->corepts[j]].cluster = corePoint[i].cluster;
//dataset[v->corepts[j]].cluster= corePoint[i].cluster;
corePoint[v->corepts[j]].visited = 1;
ps.push(&corePoint[v->corepts[j]]);
}
}
}
cout << "border point,joint border point to core point" << endl;
//border point,joint border point to core point
int k = 0;//k用来在dataset中统计是第几个核心点
for (int i = 0; i < len; i++) {
if (dataset[i].pointType == pointType_CORE)//如果该点是核心点,在上面已经访问过了,就不再访问,因为核心点不可能是边界点,没必要再访问一次
{
dataset[i].cluster = corePoint[k++].cluster;//遍历到第k个核心点时,把属于的簇id给原来的dataset
continue;
}
for (int j = 0; j < numCorePoint; j++) {
float distTemp = distP2P[i][corePoint[j].corePointID];
if (distTemp <= Eps) {
dataset[i].pointType = pointType_BORDER;
dataset[i].cluster = corePoint[j].cluster;
break;
}
}
}
cout << "output" << endl;
//output
//display
//save in .txt format named clustering.txt
fstream clustering;//save .txt
clustering.open("clustering.txt", ios::out);//save .txt
char dispInfo[500];
int dataDim = dataset[0].xn.size();//data dimension
for (int i = 0; i < len; i++) {
//%11.4lf,%11.4lf,%11.4lf,%11.4lf
sprintf(dispInfo, "第%3d个数据:", i + 1);
for (int j = 0; j < dataDim; j++) {
char dataSrc[30];
if (j == 0)
sprintf(dataSrc, "%11.4lf", dataset[i].xn[j]);
else
sprintf(dataSrc, ",%11.4lf", dataset[i].xn[j]);
strcat(dispInfo, dataSrc);
}
char dataClust[30];
sprintf(dataClust, ",%4d", dataset[i].cluster);
strcat(dispInfo, dataClust);
char datasrc1[10];
sprintf(datasrc1, " %d\n", dataset[i].pointType);
strcat(dispInfo, datasrc1);
cout << dispInfo; //display in cmd window
clustering << dispInfo;//save the results in .txt format named clustering.txt
}
clustering.close();//save .txt
}