/**
 * @brief Cluster points into num_class groups with a k-means style iteration
 *        that uses the Manhattan (L1) distance.
 *
 * Algorithm:
 *   1. Seed the cluster centers with the first num_class data points
 *      (deterministic seeding, not random).
 *   2. Assign every point to its nearest center (ties go to the lowest
 *      cluster index).
 *   3. Replace every center by the per-dimension mean of its assigned points.
 *   Steps 2 and 3 repeat until the total center movement drops below
 *   max_error or max_iters iterations have run.
 *
 * @param num_class       Number of clusters; must satisfy 1 <= num_class <= num_data.
 * @param num_data        Number of data points.
 * @param dimension       Number of components per data point; must be >= 1.
 * @param data            Input points, indexed as data[num_data][dimension]. Only read.
 * @param cluster_center  Output centers, indexed as cluster_center[num_class][dimension].
 *                        Must be allocated by the caller; overwritten in place.
 * @param max_error       Convergence threshold: iteration stops once the sum of
 *                        absolute per-component center shifts in one iteration is
 *                        below this value. Taken by reference but never modified.
 * @param max_iters       Upper bound on the number of iterations.
 *
 * If the sizes are inconsistent or a pointer is null, the function returns
 * without touching cluster_center.
 */
void kmeans_cluster(const int num_class, const int num_data, const int dimension, double **data, double **cluster_center, double &max_error, int max_iters = 1000)
{
    // Seeding copies data[0..num_class) so there must be at least that many points.
    if (num_class <= 0 || dimension <= 0 || num_class > num_data || data == nullptr || cluster_center == nullptr)
    {
        return;
    }

    // Hand-written |a - b| so the result never goes through the integer abs() overload.
    const auto abs_diff = [](const double a, const double b) { return a < b ? b - a : a - b; };

    // Step 1: seed the centers with the first num_class points.
    for (int c = 0; c < num_class; ++c)
    {
        for (int d = 0; d < dimension; ++d)
        {
            cluster_center[c][d] = data[c][d];
        }
    }

    // cluster_mark[c] holds the indices of the points currently assigned to cluster c.
    // Both buffers are sized by num_class and reused across iterations.
    std::vector<std::vector<int>> cluster_mark(num_class);
    std::vector<double> distance(num_class);

    for (int iter = 0; iter < max_iters; ++iter)
    {
        for (auto &members : cluster_mark)
        {
            members.clear();
        }

        // Step 2: assign each point to the center with the smallest L1 distance.
        for (int i = 0; i < num_data; ++i)
        {
            for (int c = 0; c < num_class; ++c)
            {
                double total = 0;
                for (int d = 0; d < dimension; ++d)
                {
                    total += abs_diff(data[i][d], cluster_center[c][d]);
                }
                distance[c] = total;
            }
            // The cluster index is the POSITION of the minimum, not its value.
            const auto nearest = std::min_element(distance.begin(), distance.end()) - distance.begin();
            cluster_mark[nearest].push_back(i);
        }

        // Step 3: move each center to the mean of its members and accumulate the shift.
        double curr_error = 0;
        for (int c = 0; c < num_class; ++c)
        {
            const std::vector<int> &members = cluster_mark[c];
            if (members.empty())
            {
                // No points assigned: keep the previous center instead of dividing by zero.
                continue;
            }
            for (int d = 0; d < dimension; ++d)
            {
                double mean_d = 0;
                for (const int idx : members)
                {
                    mean_d += data[idx][d];
                }
                mean_d /= static_cast<double>(members.size());
                curr_error += abs_diff(cluster_center[c][d], mean_d);
                cluster_center[c][d] = mean_d;
            }
        }

        if (curr_error < max_error)
        {
            break;
        }
    }
}
// Practice code; please point out any mistakes.