K均值聚类

K均值聚类,K-means clustering

作用:以无监督的方式将数据划分为若干类(聚类)
操作方式:迭代地将距离较近的点聚集在一起,形成簇,每个簇的数据点聚为一类
过程:1)初始化质心:需要分成几类,就由用户设定几个质心;好的初始质心可以减少迭代次数;
           2)计算每个点到每个质心的距离(下面实验用的是欧氏距离);
           3)比较每个点到各个质心的距离,将该点归入距离最小的质心所在的类,即该点的类标号取该质心的标号(如下图);

                         
           4)这样得到K个簇,计算每个簇的均值,作为该簇的新质心;
           5)若K个新质心与K个原质心相同(没有质心改变),则聚类结束,否则,返回第2步。
以下是C++代码:
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

using namespace std;

// A 2-D sample point. Both coordinates start at zero when default-constructed.
struct Point {
	float x{0.0f};  // horizontal coordinate
	float y{0.0f};  // vertical coordinate
};

void loadData(string filename, vector<Point> &data){
	fstream file;
	file.open(filename, std::ios::in);
	if (!file.is_open())
	{
		cout << "ERROR!\n";
		exit(0);
	}
	while (!file.eof())
	{
		Point pt;
		char msg[20];
		memset(msg, 0, 20);
		file >> msg;
		pt.x = atof(msg);

		memset(msg, 0, 20);
		file >> msg;
		pt.y = atof(msg);
		data.push_back(pt);
	}
	file.close();
}

void initCenter(const vector<Point> &data, const int class_num, vector<Point> ¢er){
	center.resize(class_num);
	float max_x = -1000,min_x = 1000., y_aver = 0.;
	for (int i = 0; i < data.size(); i++)
	{
		max_x = max_x > data[i].x ? max_x : data[i].x;
		min_x = min_x < data[i].x ? min_x : data[i].x;
		y_aver += data[i].y / data.size();
	}
	for (int i = 0 ; i < class_num; i++)
	{
		center[i].x = min_x + (max_x - min_x) / class_num * i;
		center[i].y = y_aver;
	}
}

// Reports whether two centroid lists differ.
// Returns true when the lists have different lengths, or when any pair of
// same-index centroids differs in x or y (exact floating-point comparison).
// Returns false only when both lists have the same length and every
// coordinate matches.
bool centerIsChanged(const vector<Point> &c1, const vector<Point> &c2){
	if (c1.size() == c2.size())
	{
		auto lhs = c1.begin();
		auto rhs = c2.begin();
		for (; lhs != c1.end(); ++lhs, ++rhs)
		{
			const bool same = lhs->x == rhs->x && lhs->y == rhs->y;
			if (!same)
				return true;
		}
		return false;
	}
	return true;
}

vector<vector<Point>> kmeansCluster(const vector<Point> &data, vector<Point> ¢er){
	vector<vector<float>> data_distance;
	// 计算各数据点到各个质心的距离
	for (int i = 0; i < center.size(); i++)
	{
		vector<float> distance_i;
		for (int j = 0; j < data.size(); j++)
		{
			distance_i.push_back(
				sqrt(pow(data[j].x - center[i].x, 2) + pow(data[j].y - center[i].y, 2)));
		}
		data_distance.push_back(distance_i);
	}
	vector<vector<Point>> cluster(center.size());//存储每个质心对应的簇
	for (int i = 0; i < data.size(); i++)
	{
		int min_index = 0;
		float min = data_distance[0][i];
		for (int j = 0; j < center.size(); j++)
		{
			if (data_distance[j][i] < min)
			{
				min = data_distance[j][i];
				min_index = j;//得到距离最小的类编号
			}
		}
		cluster[min_index].push_back(data[i]);
	}

	//根据聚类结果更新质心位置
	for (int i = 0; i < center.size(); i++)
	{
		center[i].x = 0;
		center[i].y = 0;
		for (int j = 0; j < cluster[i].size(); j++)
		{
			center[i].x += cluster[i][j].x / cluster[i].size();
			center[i].y += cluster[i][j].y / cluster[i].size();
		}
	}
	return cluster;
}

int main(){
	vector<Point> data;
	data.clear();
	loadData("data.txt", data);
	vector<Point> center(3);
	initCenter(data, 3, center);
	
	while (1)
	{
		vector<Point> center_temp;
		center_temp = center;
		kmeansCluster(data, center);
		if (!centerIsChanged(center, center_temp))
			break;
		cout << "迭代结果:\n";
		for (int i = 0; i < center.size(); i++)
			cout << center[i].x << "   " << center[i].y << endl;
	}

	/*while (1)
	{
		vector<vector<float>> distance_value;
		for (int i = 0; i < 3; i++)
		{
			vector<float> distance;
			for (int j = 0; j < data.size(); j++)
			{
				float dis = sqrt(pow(data[j].x - center[i].x, 2)
					+ pow(data[j].y - center[i].y, 2));
				distance.push_back(dis);
			}
			distance_value.push_back(distance);
		}
		vector<vector<Point>> cluster(3);
		for (int i = 0; i < data.size(); i++)
		{
			if (distance_value[0][i] < distance_value[1][i])
			{
				if (distance_value[0][i] < distance_value[2][i])
					cluster[0].push_back(data[i]);
			}
			else if (distance_value[1][i] < distance_value[2][i])
				cluster[1].push_back(data[i]);
			else
				cluster[2].push_back(data[i]);
		}
		vector<Point> new_center(3);
		for (int i = 0; i < 3; i++)
		{
			float x_aver = 0.;
			float y_aver = 0.;
			for (int j = 0; j < cluster[i].size(); j++)
			{
				x_aver += cluster[i][j].x / cluster[i].size();
				y_aver += cluster[i][j].y / cluster[i].size();
			}
			new_center[i].x = x_aver;
			new_center[i].y = y_aver;
		}
		int flag = 0;
		cout << "迭代结果:\n"; 
		for (int i = 0; i < 3; i++)
		{
			if (new_center[i].x == center[i].x && new_center[i].y == center[i].y)
			{
				flag ++;
			}
			center[i].x = new_center[i].x;
			center[i].y = new_center[i].y;
			cout << center[i].x << "   " << center[i].y << endl;
		}
		if (flag == 3)
			break;
	}*/
	
	return 0;
}
聚类结果为
          
将迭代结果绘制在散点图上如下,其中红色点为聚类后的质心:
           




评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值