自己简单实现了最基础的kmeans聚类算法,记录下也就图一乐。
kmeans原理不做赘述,大体就以下几步:
1.随机初始化k个起始中心点;
2.计算每个样本点到各个中心点的距离,并把该样本点归入距离它最近的中心点所在的类;
3.聚类好所有样本点后,对聚到同一类的点再计算他们的中心点,更新中心点;
4.循环2、3两步,直到达到指定的循环次数或者满足退出循环条件时(如每次循环中心点移动距离小于某个值),退出循环,over.
载入数据
从txt文件载入数据,现在用的是之前物体检测的结果,只取前两列xy坐标。
DigitDetcResult_vec2.txt
323 123 13 33 8 0.994399
515 223 15 36 8 0.989735
168 187 13 31 4 0.992918
396 374 9 23 4 0.755749
181 186 12 34 0 0.896224
337 124 13 32 0 0.988561
440 254 13 34 0 0.979571
462 399 11 32 1 0.984626
412 336 12 35 0 0.993743
233 134 13 31 6 0.994085
248 134 13 30 0 0.998754
397 338 12 32 4 0.997557
166 274 13 36 0 0.998852
396 53 17 36 6 0.999508
383 176 11 30 1 0.991233
190 352 12 37 0 0.954582
410 176 13 33 0 0.996125
385 336 11 31 1 0.946704
426 255 13 31 2 0.971079
413 255 11 29 1 0.992365
134 421 15 35 0 0.997429
72 219 15 36 2 0.99865
396 175 12 34 0 0.99587
386 372 9 24 1 0.933619
195 52 14 31 4 0.999961
475 398 13 36 0 0.998914
408 373 11 28 5 0.925378
151 276 13 32 2 0.999217
那么加载数据的函数实现为
// Loads 2-D sample points from a whitespace-separated text file.
//
// Each line is expected to start with two numbers (x, then y); any further
// columns on the line are ignored. Lines that do not start with two numbers
// (for example an empty last line) are skipped instead of producing a point.
// The values are parsed as float and truncated to int when stored in Point.
// Parsed points are appended to `dataset` in file order.
//
// If the file cannot be opened, prints "Unable to open file" and terminates
// the process with exit code 1.
void LoadData(string data_path,vector<Point> &dataset)
{
    std::ifstream myfile(data_path);
    if (!myfile) {
        std::cout << "Unable to open file";
        std::exit(1); // terminate with error
    }

    // Loop on getline() rather than eof(): testing eof() before reading makes
    // the loop run once more after the last line. Reading into std::string and
    // pushing directly into `dataset` removes the old limits of 64 characters
    // per line and 64 lines per file.
    std::string line;
    while (std::getline(myfile, line))
    {
        std::istringstream fields(line);
        float x = 0.0f;
        float y = 0.0f;
        if (!(fields >> x >> y))
            continue; // blank or malformed line

        Point tmp;
        tmp.x = static_cast<int>(x);
        tmp.y = static_cast<int>(y);
        dataset.push_back(tmp);
    }
}
把数据点保存在dataset里面。
随机生成中心点
加载完数据之后,随机生成k个中心点,保存在random_centers里面。(注意:如果没有先用srand设置随机种子,rand每次运行产生的随机数序列都相同,所以k不变时每次随机选出的中心点也相同。)
// Picks initial cluster centers by drawing points of `dataset` at distinct
// random indices and appending them to `random_centers`.
//
// k               number of centers requested; clamped to dataset.size()
//                 because no more than that many distinct indices exist.
// dataset         sample points to draw from.
// random_centers  receives the chosen points (appended).
//
// Does nothing when the dataset is empty or k is not positive.
// Indices come from rand(); the caller decides whether to seed it with srand().
void GenerateCenters(int k, vector<Point> dataset, vector<Point> &random_centers)
{
    const int n = static_cast<int>(dataset.size());
    if (n == 0 || k <= 0)
        return; // nothing to pick; also avoids rand() % 0
    if (k > n)
        k = n; // otherwise the retry loop below could never finish

    // Track used indices separately instead of overwriting x with -1, so a
    // real sample whose x is -1 can still be chosen.
    std::vector<char> taken(n, 0);
    for (int i = 0; i < k; i++)
    {
        int random_index = rand() % n;
        while (taken[random_index]) // make sure k different points are taken
        {
            random_index = rand() % n;
        }
        taken[random_index] = 1;
        random_centers.push_back(dataset[random_index]);
    }
}
计算距离并更新
计算样本点至中心点的距离,根据聚类结果更新中心点
// Runs one k-means pass: assigns every point of `dataset` to its nearest
// center, then moves each center to the (integer-truncated) mean of the points
// assigned to it.
//
// dataset         sample points.
// random_centers  current centers on entry, updated centers on return.
//
// Prints every center as "(x,y)" after the update and sets the global `stop`
// to true when the pass left all centers unchanged. A center that attracted no
// points keeps its previous position.
void DoKmeansCluster(vector<Point> dataset, vector<Point> &random_centers)
{
    const vector<Point> previous_centers(random_centers);
    const int point_num = static_cast<int>(dataset.size());
    const int k = static_cast<int>(random_centers.size());

    // Per-cluster accumulators, filled while labelling so that the update step
    // does not need another scan of all points for every cluster.
    std::vector<long long> sum_x(k, 0);
    std::vector<long long> sum_y(k, 0);
    std::vector<int> count(k, 0);

    // Cluster the points by their distance to the centers.
    vector<Kmeans> result;
    result.reserve(dataset.size());
    for (int p = 0; p < point_num; p++)
    {
        Kmeans res;
        res.pos = dataset[p];
        res.label = -1;
        float best_distance = 0.0f;
        for (int q = 0; q < k; q++)
        {
            const float candidate = GetDistance(dataset[p], random_centers[q]);
            // The first center always becomes the initial best, so every point
            // gets a label no matter how far away it is.
            if (q == 0 || candidate < best_distance)
            {
                best_distance = candidate;
                res.label = q;
            }
        }
        result.push_back(res);
        if (res.label >= 0)
        {
            count[res.label]++;
            sum_x[res.label] += res.pos.x;
            sum_y[res.label] += res.pos.y;
        }
    }

    // Update the centers from the clustering result.
    for (int i = 0; i < k; i++)
    {
        if (count[i] > 0) // an empty cluster would otherwise divide by zero
        {
            random_centers[i].x = static_cast<int>(sum_x[i] / count[i]);
            random_centers[i].y = static_cast<int>(sum_y[i] / count[i]);
        }
        cout << "(" << random_centers[i].x << "," << random_centers[i].y << ")" << endl;
    }

    if (previous_centers == random_centers)
        stop = true;
}
GetDistance 计算两点间欧几里得距离
// Returns the Euclidean distance between p0 and p1.
// The differences are squared in double so that widely separated integer
// coordinates cannot overflow int before the square root is taken.
float GetDistance(Point p0, Point p1)
{
    const double dx = static_cast<double>(p0.x) - static_cast<double>(p1.x);
    const double dy = static_cast<double>(p0.y) - static_cast<double>(p1.y);
    return static_cast<float>(std::sqrt(dx * dx + dy * dy));
}
完整代码
用opencv显示了一下结果,取k=4。因为cv::Point里x y 都取的int,所以这里判断循环结束条件就直接判断相等就ok了。
#include "pch.h"
#include <cmath>
#include <cstdlib>
#include <ctime>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <opencv2/opencv.hpp>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
// Number of clusters (k): main passes it to GenerateCenters and uses it to
// size the per-cluster color table.
#define CLUSTER_NUMBER 4
// Unqualified names below (vector, string, cout, Point, Mat, Scalar, ...) come
// from these two namespaces.
using namespace std;
using namespace cv;
// A sample point together with the index of the cluster it was assigned to.
struct Kmeans
{
    Point pos;       // coordinates of the sample
    int label = -1;  // index of the assigned center; -1 while unassigned, so a
                     // point that never gets a label is not read as garbage
};
// Reads the first two columns of each line of the text file as x/y and appends
// the resulting points to dataset.
void LoadData(string data_path,vector<Point> &dataset);
// Draws k points of dataset at distinct random indices and appends them to
// random_centers.
void GenerateCenters(int k, vector<Point> dataset, vector<Point> &random_centers);
// Euclidean distance between p0 and p1.
float GetDistance(Point p0, Point p1);
// One k-means pass: labels every point with its nearest center (stored in
// result), moves each center to the mean of its points, and sets `stop` when
// no center changed.
void DoKmeansCluster(vector<Point> dataset, vector<Point> &random_centers, vector<Kmeans> &result);
// Convergence flag: set to true by DoKmeansCluster, polled by the loop in main.
bool stop = false;
int main()
{
Mat res(600, 600, CV_8UC3, Scalar(255, 255, 255, 0.5));
vector<Point> dataset,random_centers;
//载入数据
LoadData("DigitDetcResult_vec2.txt", dataset);
//随机生成k个中心点
GenerateCenters(CLUSTER_NUMBER, dataset, random_centers);
vector<Kmeans> result;
while(!stop)
{
DoKmeansCluster(dataset, random_centers, result);
}
vector<Scalar> colors(CLUSTER_NUMBER);
for (int i = 0; i < CLUSTER_NUMBER; i++)
{
colors[i] = Scalar(rand() % 256, rand() % 256, rand() % 256);
for (int j = 0; j < result.size(); j++)
{
if(result[j].label == i)
circle(res, result[j].pos, 4, colors[i], -1);
}
}
imshow("res", res);
waitKey(0);
}
// Loads 2-D sample points from a whitespace-separated text file.
//
// Each line is expected to start with two numbers (x, then y); any further
// columns on the line are ignored. Lines that do not start with two numbers
// (for example an empty last line) are skipped instead of producing a point.
// The values are parsed as float and truncated to int when stored in Point.
// Parsed points are appended to `dataset` in file order.
//
// If the file cannot be opened, prints "Unable to open file" and terminates
// the process with exit code 1.
void LoadData(string data_path,vector<Point> &dataset)
{
    std::ifstream myfile(data_path);
    if (!myfile) {
        std::cout << "Unable to open file";
        std::exit(1); // terminate with error
    }

    // Loop on getline() rather than eof(): testing eof() before reading makes
    // the loop run once more after the last line. Reading into std::string and
    // pushing directly into `dataset` removes the old limits of 64 characters
    // per line and 64 lines per file.
    std::string line;
    while (std::getline(myfile, line))
    {
        std::istringstream fields(line);
        float x = 0.0f;
        float y = 0.0f;
        if (!(fields >> x >> y))
            continue; // blank or malformed line

        Point tmp;
        tmp.x = static_cast<int>(x);
        tmp.y = static_cast<int>(y);
        dataset.push_back(tmp);
    }
}
// Picks initial cluster centers by drawing points of `dataset` at distinct
// random indices and appending them to `random_centers`.
//
// k               number of centers requested; clamped to dataset.size()
//                 because no more than that many distinct indices exist.
// dataset         sample points to draw from.
// random_centers  receives the chosen points (appended).
//
// Does nothing when the dataset is empty or k is not positive.
// Indices come from rand(); the caller decides whether to seed it with srand().
void GenerateCenters(int k, vector<Point> dataset, vector<Point> &random_centers)
{
    const int n = static_cast<int>(dataset.size());
    if (n == 0 || k <= 0)
        return; // nothing to pick; also avoids rand() % 0
    if (k > n)
        k = n; // otherwise the retry loop below could never finish

    // Track used indices separately instead of overwriting x with -1, so a
    // real sample whose x is -1 can still be chosen.
    std::vector<char> taken(n, 0);
    for (int i = 0; i < k; i++)
    {
        int random_index = rand() % n;
        while (taken[random_index]) // make sure k different points are taken
        {
            random_index = rand() % n;
        }
        taken[random_index] = 1;
        random_centers.push_back(dataset[random_index]);
    }
}
// Returns the Euclidean distance between p0 and p1.
// The differences are squared in double so that widely separated integer
// coordinates cannot overflow int before the square root is taken.
float GetDistance(Point p0, Point p1)
{
    const double dx = static_cast<double>(p0.x) - static_cast<double>(p1.x);
    const double dy = static_cast<double>(p0.y) - static_cast<double>(p1.y);
    return static_cast<float>(std::sqrt(dx * dx + dy * dy));
}
// Runs one k-means pass: assigns every point of `dataset` to its nearest
// center, then moves each center to the (integer-truncated) mean of the points
// assigned to it.
//
// dataset         sample points.
// random_centers  current centers on entry, updated centers on return.
// result          replaced with one entry per point (position + label of the
//                 nearest center) describing this pass.
//
// Prints every center as "(x,y)" after the update and sets the global `stop`
// to true when the pass left all centers unchanged. A center that attracted no
// points keeps its previous position.
void DoKmeansCluster(vector<Point> dataset, vector<Point> &random_centers, vector<Kmeans> &result)
{
    const vector<Point> previous_centers(random_centers);
    const int point_num = static_cast<int>(dataset.size());
    const int k = static_cast<int>(random_centers.size());

    // Drop the labels of earlier passes. Without this `result` keeps growing
    // and the centers would always be recomputed from the first pass's labels.
    result.clear();
    result.reserve(dataset.size());

    // Per-cluster accumulators, filled while labelling so that the update step
    // does not need another scan of all points for every cluster.
    std::vector<long long> sum_x(k, 0);
    std::vector<long long> sum_y(k, 0);
    std::vector<int> count(k, 0);

    // Cluster the points by their distance to the centers.
    for (int p = 0; p < point_num; p++)
    {
        Kmeans res;
        res.pos = dataset[p];
        res.label = -1;
        float best_distance = 0.0f;
        for (int q = 0; q < k; q++)
        {
            const float candidate = GetDistance(dataset[p], random_centers[q]);
            // The first center always becomes the initial best, so every point
            // gets a label no matter how far away it is.
            if (q == 0 || candidate < best_distance)
            {
                best_distance = candidate;
                res.label = q;
            }
        }
        result.push_back(res);
        if (res.label >= 0)
        {
            count[res.label]++;
            sum_x[res.label] += res.pos.x;
            sum_y[res.label] += res.pos.y;
        }
    }

    // Update the centers from the clustering result.
    for (int i = 0; i < k; i++)
    {
        if (count[i] > 0) // an empty cluster would otherwise divide by zero
        {
            random_centers[i].x = static_cast<int>(sum_x[i] / count[i]);
            random_centers[i].y = static_cast<int>(sum_y[i] / count[i]);
        }
        cout << "(" << random_centers[i].x << "," << random_centers[i].y << ")" << endl;
    }

    if (previous_centers == random_centers)
        stop = true;
}
好了,最基本的kmeans就这样吧,有什么纰漏欢迎指出。