C++实现AGNES算法
该算法是自底而上的算法,他的主要原理是所有数据每个样本看成一个初始聚类,在算法运行过程中不断找出距离自己最近的聚类之后合并,直到达到自己设定的聚类个数,所用数据集为python的鸢尾花数据,伪代码如下:
C++代码如下
//DataPoint.h 保存基础数据的类
#ifndef _DATAPOINT_H_
#define _DATAPOINT_H_
#include <vector>
#include <string>
#include <iostream>
#include<map>
using namespace std;
class DataPoint
{
public:
DataPoint() {}
~DataPoint() {}
vector<double> GetData(); //获取保存数据
string GetName(); //获取花卉名字
int GetClusterId(); //获取聚类id
void SetClusterId(int number);//设置聚类id
void SetName(string str);//设置花卉名字
void SetData(vector<double> v);//设置保存数据
private:
vector<double> Data; //保存数据
string name; //花卉名字
int clusterId;//聚类id
};
#endif // _DATAPOINT_H_
//DataPoint.cpp
#include "DataPoint.h"
void DataPoint::SetClusterId(int number)
{
this->clusterId = number;
}
void DataPoint::SetData(vector<double> v)
{
for (int i = 0; i < v.size(); i++)
{
Data.push_back(v[i]);
}
}
void DataPoint::SetName(string name)
{
this->name = name;
}
vector<double> DataPoint::GetData()
{
return this->Data;
}
string DataPoint::GetName()
{
return this->name;
}
int DataPoint::GetClusterId()
{
return this->clusterId;
}
//AGNES.cpp
#ifndef _AGNES_H_
#define _AGNES_H_
#include <iostream>
#include<string>
#include<vector>
#include <fstream>
#include <sstream>
#include "DataPoint.h"
using namespace std;
//将string数据转为数字
template <class Type>
Type stringToNum(const string& str)
{
istringstream iss(str);
Type num;
iss >> num;
return num;
}
class AGNES
{
public:
AGNES(double ThreshordDis, int MaxClu) :ThreshordDis(ThreshordDis), MaxClu(MaxClu) {}
~AGNES() {}
void StartAgnes(); //算法核心
void GetData();//获取数据
void print();//打印信息
private:
double GetDistance(DataPoint &point, DataPoint &point2); //得到两个数据之间的距离
double GetCDistance(vector<DataPoint> C, vector<DataPoint> C2); //得到两个聚类之间的最小距离
double GetAVGDistance(vector<DataPoint> C, vector<DataPoint> C2);//得到两个聚类之间的平均距离
vector<DataPoint> DataBase; //保存所有数据
vector<vector<DataPoint>> CluData; //保存所有簇类数据
double ThreshordDis; //这里暂时无用,可以省略
int MaxClu; //最终获得聚类数量
int p; //存储最初的聚类的个数
};
#endif
//AGNES.cpp
#include "AGNES.h"
static double DataMap[10000][10000]; //保存两两聚类之间的距离
void AGNES::GetData()
{
ifstream file;
string line;
file.open("iris.csv",ios::in);
if (file.fail())
{
cout << "文件打开失败" << endl;
}
while (getline(file, line))
{
stringstream ss(line);
string str;
vector<string> temp;
vector<double> v;
DataPoint d;
while (getline(ss, str, ','))
{
temp.push_back(str);
}
for (int i = 1; i < temp.size()-1; i++)
{
v.push_back(stringToNum<double>(temp[i]));
}
//初始化数据
d.SetData(v);
d.SetName(temp[temp.size() - 1]);
d.SetClusterId(stringToNum<int>(temp[0])-1);//因为数据是从1开始编号的所以这里-1使其从0开始编号
DataBase.push_back(d);
}
}
void AGNES::StartAgnes()
{
//获得每个数据距离其他数据的距离
for (int i = 0; i < DataBase.size(); i++)
{
vector<DataPoint> t;
t.push_back(DataBase[i]);
CluData.push_back(t);
}
//DataMap数组保存两个聚类之间的距离
for (int i = 0; i < CluData.size(); i++)
{
vector<DataPoint> temp;
for (int j =i+1; j < CluData.size(); j++)
{
double dis = GetAVGDistance(CluData[i], CluData[j]);
DataMap[i][j] = dis;
DataMap[j][i] = dis;
}
}
p = DataBase.size(); //设置聚类个数
while (p > MaxClu)
{
double Temp = 9999;
int Find_i = 0, Find_j = 0;
//寻找最小值点
for (int i = 0; i < p; i++)
{
for (int j = i+1; j < p; j++)
{
if (DataMap[i][j] < Temp)
{
Temp = DataMap[i][j];
Find_i = i;
Find_j = j;
}
}
}
int NewId = CluData[Find_i][0].GetClusterId(); //获取当前簇的聚类号码
//将寻找到的最小点的两个簇合并 A1+B1->A1
//并将两个簇的簇类号码统一
for (int j = 0; j < CluData[Find_j].size(); j++)
{
CluData[Find_j][j].SetClusterId(NewId);
CluData[Find_i].push_back(CluData[Find_j][j]);
}
//重编号矩阵
for (int i = Find_j + 1; i < p; i++)
{
for (int j = 0; j < CluData[i].size(); j++)
{
CluData[i][j].SetClusterId(CluData[i][j].GetClusterId() - 1);
}
}
//距离矩阵重置
for (int i = Find_j; i < CluData.size()-1; i++)
{
CluData[i] = CluData[i + 1];
}
//删除DataMap矩阵 行列
for (int i = 0; i < p; i++)
{
for (int j = Find_j; j < p-1; j++)
{
DataMap[i][j] = DataMap[i][j + 1];
}
}
for (int i = Find_j; i < p; i++)
{
for (int j = 0; j < p - 1; j++)
{
DataMap[i][j] = DataMap[i + 1][j];
}
}
p = p - 1;
//重新计算合并数据后的聚类与其他聚类之间的距离
for (int i = 0; i < p; ++i)
{
double dis = GetAVGDistance(CluData[Find_i], CluData[i]);
if (DataMap[Find_i][i] != dis)
{
DataMap[Find_i][i] = dis;
DataMap[i][Find_i] = dis;
}
}
}
}
//打印数据
void AGNES::print()
{
for (int i = 0; i < p; i++)
{
cout << "聚类编号:" << i << endl;
for (int j = 0; j < CluData[i].size(); j++)
{
vector<double> dat = CluData[i][j].GetData();
for (int s = 0; s < dat.size(); s++)
{
cout << dat[s] << " ";
}
cout << CluData[i][j].GetName() << " ";
cout << CluData[i][j].GetClusterId();
cout << endl;
}
}
}
//两个数据之间的距离
double AGNES::GetDistance(DataPoint &point, DataPoint &point2)
{
double sum = 0;
for (int i = 0; i < point.GetData().size(); i++)
{
sum += (point.GetData()[i] - point2.GetData()[i])*(point.GetData()[i] - point2.GetData()[i]);
}
double result = sqrt(sum);
return result;
}
//两个聚类之间最小的距离
double AGNES::GetCDistance(vector<DataPoint> C, vector<DataPoint> C2)
{
double MinDis = 9999;
for (int i = 0; i < C.size(); i++)
{
for (int j = 0; j < C2.size(); j++)
{
double dis = GetDistance(C[i], C2[j]);
if (dis < MinDis)
{
MinDis = dis;
}
}
}
return MinDis;
}
//两个聚类的平均距离
double AGNES::GetAVGDistance(vector<DataPoint> C, vector<DataPoint> C2)
{
double temp[4];
double temp2[4];
for (int i = 0; i < C.size(); i++)
{
temp[0] += C[i].GetData()[0];
temp[1] += C[i].GetData()[1];
temp[2] += C[i].GetData()[2];
temp[3] += C[i].GetData()[3];
}
temp[0] = temp[0] / C.size();
temp[1] = temp[1] / C.size();
temp[2] = temp[2] / C.size();
temp[3] = temp[3] / C.size();
for (int i = 0; i < C2.size(); i++)
{
temp2[0] += C2[i].GetData()[0];
temp2[1] += C2[i].GetData()[1];
temp2[2] += C2[i].GetData()[2];
temp2[3] += C2[i].GetData()[3];
}
temp2[0] = temp2[0] / C2.size();
temp2[1] = temp2[1] / C2.size();
temp2[2] = temp2[2] / C2.size();
temp2[3] = temp2[3] / C2.size();
int sum = 0;
for (int i = 0; i < 4; i++)
{
sum += ((temp[i] - temp2[i])*(temp[i] - temp2[i]));
}
return sqrt(sum);
}