c++实现kmeans

自己简单实现了最基础的kmeans聚类算法,记录下也就图一乐。
kmeans原理不做赘述,大体就以下几步:
1.随机初始化k个起始中心点;
2.计算所有样本点到各个中心点的距离;对于每个样本点,把它归到距离最近的中心点所在的那一类;
3.聚类好所有样本点后,对聚到同一类的点再计算他们的中心点,更新中心点;
4.循环2、3两步,直到达到指定的循环次数或者满足退出循环条件时(如每次循环中心点移动距离小于某个值),退出循环,over.

载入数据

从txt文件载入数据,现在用的是之前物体检测的结果,只取前两列xy坐标。

DigitDetcResult_vec2.txt
323 123 13 33 8 0.994399
515 223 15 36 8 0.989735
168 187 13 31 4 0.992918
396 374 9 23 4 0.755749
181 186 12 34 0 0.896224
337 124 13 32 0 0.988561
440 254 13 34 0 0.979571
462 399 11 32 1 0.984626
412 336 12 35 0 0.993743
233 134 13 31 6 0.994085
248 134 13 30 0 0.998754
397 338 12 32 4 0.997557
166 274 13 36 0 0.998852
396 53 17 36 6 0.999508
383 176 11 30 1 0.991233
190 352 12 37 0 0.954582
410 176 13 33 0 0.996125
385 336 11 31 1 0.946704
426 255 13 31 2 0.971079
413 255 11 29 1 0.992365
134 421 15 35 0 0.997429
72 219 15 36 2 0.99865
396 175 12 34 0 0.99587
386 372 9 24 1 0.933619
195 52 14 31 4 0.999961
475 398 13 36 0 0.998914
408 373 11 28 5 0.925378
151 276 13 32 2 0.999217

那么加载数据的函数实现为

// Loads 2-D sample points from a whitespace-separated text file.
//
// Each line is expected to start with two numeric columns (x, y); any
// further columns on the line are ignored. Lines that do not start with two
// numbers (for example a blank trailing line) are skipped.
//
// data_path: path of the text file to read.
// dataset:   output; one Point is appended per successfully parsed line.
//
// Prints a message and terminates the process with exit code 1 if the file
// cannot be opened.
void LoadData(string data_path,vector<Point> &dataset)
{
	ifstream myfile(data_path);
	if (!myfile) {
		cout << "Unable to open file";
		exit(1); // terminate with error
	}

	// Read whole lines into a std::string so neither the number of samples
	// nor the line length is limited by a fixed-size buffer, and loop on the
	// result of getline() so a trailing newline does not yield a bogus point.
	string line;
	while (getline(myfile, line))
	{
		istringstream fields(line);
		float x = 0.0f;
		float y = 0.0f;
		if (!(fields >> x >> y))
			continue; // not a data line

		Point tmp;
		tmp.x = static_cast<int>(x);
		tmp.y = static_cast<int>(y);
		dataset.push_back(tmp);
	}
}

把数据点保存在dataset里面。

随机生成中心点

加载完数据之后,随机生成k个中心点,保存在random_centers里面。(注意:代码中没有调用srand设置随机数种子,rand每次运行产生的随机数序列都相同,所以k不变时每次随机选出的中心点也都相同。)

void GenerateCenters(int k, vector<Point> dataset, vector<Point> &random_centers)
{
	vector<Point>tmp(dataset);

	int random_index;
	int n = dataset.size();
	for (int i = 0; i < k; i++)
	{
		random_index = rand() % n;
		//cout << random_index << endl;
		while (tmp.at(random_index).x == -1)//确保取出k个不同点
		{
			random_index = rand() % n;
		}
		tmp.at(random_index).x = -1;
		random_centers.push_back(dataset.at(random_index));
	}
}

计算距离并更新

计算样本点至中心点的距离,根据聚类结果更新中心点

// Runs one k-means iteration.
//
// Every point of dataset is assigned to its nearest centre (ties go to the
// lowest centre index), then each centre is moved to the integer mean of the
// points assigned to it. A centre that received no points is left where it
// is. The updated centres are printed to stdout, one per line.
//
// dataset:        the sample points.
// random_centers: in/out; current centres on entry, updated centres on exit.
//
// Sets the global `stop` to true when no centre moved during this iteration.
void DoKmeansCluster(vector<Point> dataset, vector<Point> &random_centers)
{
	const vector<Point> previous_centers(random_centers);
	const int point_num = static_cast<int>(dataset.size());
	const int k = static_cast<int>(random_centers.size());

	// Per-cluster accumulators, filled while assigning so the data is
	// traversed once instead of once per cluster.
	vector<long long> sum_x(k, 0);
	vector<long long> sum_y(k, 0);
	vector<int> count(k, 0);

	// Assignment step.
	vector<Kmeans> result;
	result.reserve(dataset.size());
	for (int p = 0; p < point_num; p++)
	{
		// Start from the largest float so the first centre always wins,
		// however far away it is.
		float best_distance = numeric_limits<float>::max();
		Kmeans res;
		res.pos = dataset[p];
		res.label = -1; // stays -1 only when there are no centres
		for (int q = 0; q < k; q++)
		{
			const float candidate = GetDistance(dataset[p], random_centers[q]);
			if (candidate < best_distance)
			{
				best_distance = candidate;
				res.label = q;
			}
		}
		if (res.label >= 0)
		{
			count[res.label]++;
			sum_x[res.label] += res.pos.x;
			sum_y[res.label] += res.pos.y;
		}
		result.push_back(res);
	}

	// Update step.
	for (int i = 0; i < k; i++)
	{
		// Skip empty clusters: dividing by a zero count is undefined.
		if (count[i] > 0)
		{
			random_centers[i].x = static_cast<int>(sum_x[i] / count[i]);
			random_centers[i].y = static_cast<int>(sum_y[i] / count[i]);
		}
		cout << "(" << random_centers[i].x << "," << random_centers[i].y << ")" << endl;
	}
	if (previous_centers == random_centers)
		stop = true;
}

GetDistance 计算两点间欧几里得距离

// Returns the Euclidean distance between p0 and p1.
//
// The differences are formed in double rather than in the coordinate type,
// so squaring them cannot overflow for widely separated points.
float GetDistance(Point p0, Point p1)
{
	const double dx = static_cast<double>(p0.x) - static_cast<double>(p1.x);
	const double dy = static_cast<double>(p0.y) - static_cast<double>(p1.y);
	return static_cast<float>(sqrt(dx * dx + dy * dy));
}

完整代码

用opencv显示了一下结果,取k=4。因为cv::Point的x、y都是int类型,更新后的中心点坐标也是整数,所以循环结束条件直接判断前后两次中心点是否完全相等就可以了。
在这里插入图片描述

#include "pch.h"
#include <opencv2/opencv.hpp>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <limits>
#include <sstream>
#include <string>
#include <utility>
#include <vector>


#define CLUSTER_NUMBER 4

using namespace std;
using namespace cv;
// One clustered sample: a point together with the index of the cluster it
// was assigned to.
struct Kmeans
{
	Point pos;       // the sample point
	int label = -1;  // index of the assigned centre; -1 until one is assigned
};
// Reads points from the text file at data_path and appends them to dataset.
void LoadData(string data_path,vector<Point> &dataset);
// Appends k randomly chosen points of dataset to random_centers.
void GenerateCenters(int k, vector<Point> dataset, vector<Point> &random_centers);
// Returns the Euclidean distance between p0 and p1.
float GetDistance(Point p0, Point p1);
// Runs one assignment/update pass: labels every point of dataset with its
// nearest centre (stored in result) and recomputes random_centers.
void DoKmeansCluster(vector<Point> dataset, vector<Point> &random_centers, vector<Kmeans> &result);
// Loop flag for main(); DoKmeansCluster sets it to true once a pass leaves
// the centres unchanged.
bool stop = false;

int main()
{
	Mat res(600, 600, CV_8UC3, Scalar(255, 255, 255, 0.5));
	vector<Point> dataset,random_centers;
	//载入数据
	LoadData("DigitDetcResult_vec2.txt", dataset);
	
	//随机生成k个中心点
	GenerateCenters(CLUSTER_NUMBER, dataset, random_centers);
	vector<Kmeans> result;
	while(!stop)
	{
		DoKmeansCluster(dataset, random_centers, result);
	}
	vector<Scalar> colors(CLUSTER_NUMBER);
	for (int i = 0; i < CLUSTER_NUMBER; i++)
	{
		colors[i] = Scalar(rand() % 256, rand() % 256, rand() % 256);
		for (int j = 0; j < result.size(); j++)
		{
			if(result[j].label == i)
				circle(res, result[j].pos, 4, colors[i], -1);
		}
	}
	imshow("res", res);
	waitKey(0);
}

// Loads 2-D sample points from a whitespace-separated text file.
//
// Each line is expected to start with two numeric columns (x, y); any
// further columns on the line are ignored. Lines that do not start with two
// numbers (for example a blank trailing line) are skipped.
//
// data_path: path of the text file to read.
// dataset:   output; one Point is appended per successfully parsed line.
//
// Prints a message and terminates the process with exit code 1 if the file
// cannot be opened.
void LoadData(string data_path,vector<Point> &dataset)
{
	ifstream myfile(data_path);
	if (!myfile) {
		cout << "Unable to open file";
		exit(1); // terminate with error
	}

	// Read whole lines into a std::string so neither the number of samples
	// nor the line length is limited by a fixed-size buffer, and loop on the
	// result of getline() so a trailing newline does not yield a bogus point.
	string line;
	while (getline(myfile, line))
	{
		istringstream fields(line);
		float x = 0.0f;
		float y = 0.0f;
		if (!(fields >> x >> y))
			continue; // not a data line

		Point tmp;
		tmp.x = static_cast<int>(x);
		tmp.y = static_cast<int>(y);
		dataset.push_back(tmp);
	}
}

void GenerateCenters(int k, vector<Point> dataset, vector<Point> &random_centers)
{
	vector<Point>tmp(dataset);

	int random_index;
	int n = dataset.size();
	for (int i = 0; i < k; i++)
	{
		random_index = rand() % n;
		//cout << random_index << endl;
		while (tmp.at(random_index).x == -1)//确保取出k个不同点
		{
			random_index = rand() % n;
		}
		tmp.at(random_index).x = -1;
		random_centers.push_back(dataset.at(random_index));
	}
}

// Returns the Euclidean distance between p0 and p1.
//
// The differences are formed in double rather than in the coordinate type,
// so squaring them cannot overflow for widely separated points.
float GetDistance(Point p0, Point p1)
{
	const double dx = static_cast<double>(p0.x) - static_cast<double>(p1.x);
	const double dy = static_cast<double>(p0.y) - static_cast<double>(p1.y);
	return static_cast<float>(sqrt(dx * dx + dy * dy));
}

// Runs one k-means iteration.
//
// Every point of dataset is assigned to its nearest centre (ties go to the
// lowest centre index), then each centre is moved to the integer mean of the
// points assigned to it. A centre that received no points is left where it
// is. The updated centres are printed to stdout, one per line.
//
// dataset:        the sample points.
// random_centers: in/out; current centres on entry, updated centres on exit.
// result:         output; replaced with one entry per point of dataset, in
//                 the same order, carrying the label from this iteration.
//
// Sets the global `stop` to true when no centre moved during this iteration.
void DoKmeansCluster(vector<Point> dataset, vector<Point> &random_centers, vector<Kmeans> &result)
{
	const vector<Point> previous_centers(random_centers);
	const int point_num = static_cast<int>(dataset.size());
	const int k = static_cast<int>(random_centers.size());

	// Drop the labels of earlier iterations: callers index result by point
	// number, so it must hold exactly one fresh entry per point.
	result.clear();
	result.reserve(dataset.size());

	// Per-cluster accumulators, filled while assigning so the data is
	// traversed once instead of once per cluster.
	vector<long long> sum_x(k, 0);
	vector<long long> sum_y(k, 0);
	vector<int> count(k, 0);

	// Assignment step.
	for (int p = 0; p < point_num; p++)
	{
		// Start from the largest float so the first centre always wins,
		// however far away it is.
		float best_distance = numeric_limits<float>::max();
		Kmeans res;
		res.pos = dataset[p];
		res.label = -1; // stays -1 only when there are no centres
		for (int q = 0; q < k; q++)
		{
			const float candidate = GetDistance(dataset[p], random_centers[q]);
			if (candidate < best_distance)
			{
				best_distance = candidate;
				res.label = q;
			}
		}
		if (res.label >= 0)
		{
			count[res.label]++;
			sum_x[res.label] += res.pos.x;
			sum_y[res.label] += res.pos.y;
		}
		result.push_back(res);
	}

	// Update step.
	for (int i = 0; i < k; i++)
	{
		// Skip empty clusters: dividing by a zero count is undefined.
		if (count[i] > 0)
		{
			random_centers[i].x = static_cast<int>(sum_x[i] / count[i]);
			random_centers[i].y = static_cast<int>(sum_y[i] / count[i]);
		}
		cout << "(" << random_centers[i].x << "," << random_centers[i].y << ")" << endl;
	}
	if (previous_centers == random_centers)
		stop = true;
}

好了,最基本的kmeans就这样吧,有什么纰漏欢迎指出。

  • 2
    点赞
  • 17
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值