K均值聚类,K-means clustering
作用:无监督的实现数据分类
操作方式:迭代地将距离较近的点聚集在一起,形成簇,每个簇的数据点聚为一类
过程:1)初始化质心,有几个类就需要用户设定几个质心,好的初始化质心可减少迭代次数;
2)计算每个点到每个质心的距离(下面实验用的是欧氏距离);
3)比较每个点到各个质心的距离,距离最小则该点聚类的类标号为该质心的标号(如下图);
4)这样得到K个簇,计算每个簇的均值,作为该簇的新质心;
5)若K个新质心与K个原质心相同(没有质心改变),则聚类结束,否则,返回第2步。
以下是C++代码:
#include <iostream>
#include <vector>
#include <fstream>
using namespace std;
struct Point {
float x = 0.;
float y = 0.;
};
void loadData(string filename, vector<Point> &data){
fstream file;
file.open(filename, std::ios::in);
if (!file.is_open())
{
cout << "ERROR!\n";
exit(0);
}
while (!file.eof())
{
Point pt;
char msg[20];
memset(msg, 0, 20);
file >> msg;
pt.x = atof(msg);
memset(msg, 0, 20);
file >> msg;
pt.y = atof(msg);
data.push_back(pt);
}
file.close();
}
void initCenter(const vector<Point> &data, const int class_num, vector<Point> ¢er){
center.resize(class_num);
float max_x = -1000,min_x = 1000., y_aver = 0.;
for (int i = 0; i < data.size(); i++)
{
max_x = max_x > data[i].x ? max_x : data[i].x;
min_x = min_x < data[i].x ? min_x : data[i].x;
y_aver += data[i].y / data.size();
}
for (int i = 0 ; i < class_num; i++)
{
center[i].x = min_x + (max_x - min_x) / class_num * i;
center[i].y = y_aver;
}
}
bool centerIsChanged(const vector<Point> &c1, const vector<Point> &c2){
if (c1.size() != c2.size())
return true;
for (int i = 0; i < c1.size(); i++)
{
if (c1[i].x != c2[i].x || c1[i].y != c2[i].y)
return true;
}
return false;
}
vector<vector<Point>> kmeansCluster(const vector<Point> &data, vector<Point> ¢er){
vector<vector<float>> data_distance;
// 计算各数据点到各个质心的距离
for (int i = 0; i < center.size(); i++)
{
vector<float> distance_i;
for (int j = 0; j < data.size(); j++)
{
distance_i.push_back(
sqrt(pow(data[j].x - center[i].x, 2) + pow(data[j].y - center[i].y, 2)));
}
data_distance.push_back(distance_i);
}
vector<vector<Point>> cluster(center.size());//存储每个质心对应的簇
for (int i = 0; i < data.size(); i++)
{
int min_index = 0;
float min = data_distance[0][i];
for (int j = 0; j < center.size(); j++)
{
if (data_distance[j][i] < min)
{
min = data_distance[j][i];
min_index = j;//得到距离最小的类编号
}
}
cluster[min_index].push_back(data[i]);
}
//根据聚类结果更新质心位置
for (int i = 0; i < center.size(); i++)
{
center[i].x = 0;
center[i].y = 0;
for (int j = 0; j < cluster[i].size(); j++)
{
center[i].x += cluster[i][j].x / cluster[i].size();
center[i].y += cluster[i][j].y / cluster[i].size();
}
}
return cluster;
}
int main(){
vector<Point> data;
data.clear();
loadData("data.txt", data);
vector<Point> center(3);
initCenter(data, 3, center);
while (1)
{
vector<Point> center_temp;
center_temp = center;
kmeansCluster(data, center);
if (!centerIsChanged(center, center_temp))
break;
cout << "迭代结果:\n";
for (int i = 0; i < center.size(); i++)
cout << center[i].x << " " << center[i].y << endl;
}
/*while (1)
{
vector<vector<float>> distance_value;
for (int i = 0; i < 3; i++)
{
vector<float> distance;
for (int j = 0; j < data.size(); j++)
{
float dis = sqrt(pow(data[j].x - center[i].x, 2)
+ pow(data[j].y - center[i].y, 2));
distance.push_back(dis);
}
distance_value.push_back(distance);
}
vector<vector<Point>> cluster(3);
for (int i = 0; i < data.size(); i++)
{
if (distance_value[0][i] < distance_value[1][i])
{
if (distance_value[0][i] < distance_value[2][i])
cluster[0].push_back(data[i]);
}
else if (distance_value[1][i] < distance_value[2][i])
cluster[1].push_back(data[i]);
else
cluster[2].push_back(data[i]);
}
vector<Point> new_center(3);
for (int i = 0; i < 3; i++)
{
float x_aver = 0.;
float y_aver = 0.;
for (int j = 0; j < cluster[i].size(); j++)
{
x_aver += cluster[i][j].x / cluster[i].size();
y_aver += cluster[i][j].y / cluster[i].size();
}
new_center[i].x = x_aver;
new_center[i].y = y_aver;
}
int flag = 0;
cout << "迭代结果:\n";
for (int i = 0; i < 3; i++)
{
if (new_center[i].x == center[i].x && new_center[i].y == center[i].y)
{
flag ++;
}
center[i].x = new_center[i].x;
center[i].y = new_center[i].y;
cout << center[i].x << " " << center[i].y << endl;
}
if (flag == 3)
break;
}*/
return 0;
}
聚类结果为
迭代结果表示在散点图上为,红色点为聚类后的质心