K-means基本原理
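0.预设聚类类别数k的值;
1.选取k个初始聚类中心;
2.计算每个样本点到各聚类中心的距离,并将其归入距离最近的聚类中心所在的类;
3.以每个类中所有样本点的均值作为该类新的聚类中心;
4.重复步骤2和3,直到本轮聚类结果与上一轮聚类结果相同。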
示例一:
#include <iostream>
#include <vector>
#include<stdlib.h> // rand
#include<time.h> // time
/*
 * Example 1: two-cluster K-means on a small, hard-coded set of 2-D integer points.
 *
 * Two data points are picked at random as the initial cluster centers. Each
 * round assigns every point to the nearer center (ties go to cluster 1) and then
 * moves each center to the mean of its members. Iteration stops when a round
 * yields exactly the same two clusters as the previous round; the clusters are
 * then printed to stdout.
 *
 * Returns 0.
 */
int main()
{
    using Point = std::pair<int, int>;
    /* Centers are kept in floating point so the mean is neither truncated nor
       computed through a signed/unsigned integer division. */
    using Center = std::pair<double, double>;

    /* Data to be clustered. */
    const std::vector<Point> data{
        Point(-1, -2), Point(2, 1), Point(2, 2),
        Point(9, 9), Point(10, 10), Point(8, 11) };

    /* Preset number of clusters; the rest of this example is written for exactly two. */
    const int kCluster = 2;
    static_assert(kCluster == 2, "this example hard-codes two clusters");

    /* Pick two data points at random as the initial cluster centers. */
    srand(static_cast<unsigned int>(time(0)));
    const auto random_index = [&data]() {
        /* rand() / (RAND_MAX + 1.0) lies in [0, 1), so the result is a valid index. */
        return static_cast<int>(data.size() * (rand() / (RAND_MAX + 1.0)));
    };
    const int first = random_index();
    int second = random_index();
    /* Identical initial centers would leave cluster 2 empty for the whole first
       round, so redraw until the two indices differ. */
    while (data.size() > 1 && second == first)
    {
        second = random_index();
    }
    Center center1(data[first]);
    Center center2(data[second]);

    /* Squared Euclidean distance; comparing squares gives the same ordering as
       comparing the distances themselves and needs no sqrt/pow. */
    const auto squared_distance = [](const Point &point, const Center &center) {
        const double dx = point.first - center.first;
        const double dy = point.second - center.second;
        return dx * dx + dy * dy;
    };

    /* Move `center` to the mean of `members`; an empty cluster keeps its center. */
    const auto move_to_mean = [](const std::vector<Point> &members, Center &center) {
        if (members.empty())
        {
            return;
        }
        double sum_x = 0.0;
        double sum_y = 0.0;
        for (const Point &point : members)
        {
            sum_x += point.first;
            sum_y += point.second;
        }
        const double count = static_cast<double>(members.size());
        center.first = sum_x / count;
        center.second = sum_y / count;
    };

    /* cluster1/cluster2 hold the current round's result, previous1/previous2 the
       previous round's. Termination condition: both rounds give the same clusters. */
    std::vector<Point> cluster1, previous1;
    std::vector<Point> cluster2, previous2;
    while (true)
    {
        /* Hand the last result to previous* and start this round with empty clusters. */
        previous1.swap(cluster1);
        previous2.swap(cluster2);
        cluster1.clear();
        cluster2.clear();

        /* Each point joins the cluster whose center is nearer (ties go to cluster 1). */
        for (const Point &point : data)
        {
            if (squared_distance(point, center1) > squared_distance(point, center2))
            {
                cluster2.push_back(point);
            }
            else
            {
                cluster1.push_back(point);
            }
        }

        /* Termination condition reached. */
        if (cluster1 == previous1 && cluster2 == previous2)
        {
            break;
        }

        /* Update both cluster centers. */
        move_to_mean(cluster1, center1);
        move_to_mean(cluster2, center2);
    }

    /* Print the clustering result. */
    const auto print_cluster = [](const char *label, const std::vector<Point> &members) {
        std::cout << label << std::endl;
        for (const Point &point : members)
        {
            std::cout << "(" << point.first << "," << point.second << ")" << std::endl;
        }
    };
    print_cluster("cluster1:", cluster1);
    print_cluster("\ncluster2:", cluster2);
    return 0;
}
示例二(使用 Boost.Random 随机生成初始聚类中心):
#include <iostream>
#include <vector>
#include <array>
#include <algorithm> // std::equal
#include <boost/random.hpp>
/*
 * Example 2: K-means with kClusterNum clusters on a small, hard-coded set of
 * 2-D integer points.
 *
 * The initial centers are random coordinates drawn with Boost.Random. Each
 * round assigns every point to its nearest center (ties go to the lower cluster
 * index) and then moves each center to the mean of its members. Iteration stops
 * when a round yields exactly the same clusters as the previous round; the
 * clusters are then printed to stdout.
 *
 * Returns 0.
 */
int main()
{
    using Point = std::pair<int, int>;
    using Center = std::pair<float, float>;

    /* Data to be clustered. */
    const std::vector<Point> data{
        Point(-1, -2), Point(2, 1), Point(2, 2),
        Point(9, 9), Point(10, 10), Point(8, 11) };

    /* Random initial cluster centers: every coordinate is drawn uniformly
       between -100 and 100. */
    boost::random::mt19937 gen(std::time(0));
    boost::random::uniform_real_distribution<> dist(-100.0, 100.0);
    const int kClusterNum = 2;
    static_assert(kClusterNum >= 1, "at least one cluster is required");
    std::vector<std::vector<Point>> clusters(kClusterNum);
    std::vector<std::vector<Point>> hist_clusters(kClusterNum);
    /* Filled in a loop so that changing kClusterNum needs no other edit. */
    std::array<Center, kClusterNum> centers{};
    for (Center &center : centers)
    {
        const float x = static_cast<float>(dist(gen));
        const float y = static_cast<float>(dist(gen));
        center = Center(x, y);
    }

    /* Squared Euclidean distance; comparing squares gives the same ordering as
       comparing the distances themselves and needs no sqrt/pow. */
    const auto squared_distance = [](const Point &point, const Center &center) {
        const float dx = static_cast<float>(point.first) - center.first;
        const float dy = static_cast<float>(point.second) - center.second;
        return dx * dx + dy * dy;
    };

    /* Run the clustering. */
    while (true)
    {
        /* Keep the previous round's result in hist_clusters and start this
           round with empty clusters. */
        hist_clusters.swap(clusters);
        for (std::vector<Point> &members : clusters)
        {
            members.clear();
        }

        /* Assign every point to its nearest center, remembering the point that
           lies farthest from the center it was assigned to. */
        Point farthest_point = Point();
        float farthest_dist_sq = -1.f;
        for (const Point &point : data)
        {
            int nearest = 0;
            float nearest_dist_sq = squared_distance(point, centers[0]);
            for (int j = 1; j < kClusterNum; ++j)
            {
                const float candidate_sq = squared_distance(point, centers[j]);
                if (candidate_sq < nearest_dist_sq)
                {
                    nearest_dist_sq = candidate_sq;
                    nearest = j;
                }
            }
            clusters[nearest].push_back(point);
            if (nearest_dist_sq > farthest_dist_sq)
            {
                farthest_dist_sq = nearest_dist_sq;
                farthest_point = point;
            }
        }

        /* Stop when this round's result equals the previous round's. With empty
           data this holds in the first round, so the update below only ever
           runs after farthest_point has been set from real data. */
        if (clusters == hist_clusters)
        {
            std::cout << "Clustering finished." << std::endl;
            break;
        }

        /* Update the cluster centers. */
        for (int m = 0; m < kClusterNum; ++m)
        {
            const std::vector<Point> &members = clusters[m];
            if (members.empty())
            {
                /* A center that attracted no point is re-seeded at the worst-fit
                   data point instead of being given an arbitrary position. If
                   several clusters are empty in one round they share this point
                   for that round. */
                centers[m] = Center(static_cast<float>(farthest_point.first),
                                    static_cast<float>(farthest_point.second));
                continue;
            }
            float sum_x = 0.f;
            float sum_y = 0.f;
            for (const Point &point : members)
            {
                sum_x += point.first;
                sum_y += point.second;
            }
            const float count = static_cast<float>(members.size());
            centers[m] = Center(sum_x / count, sum_y / count);
        }
    }

    /* Print the clustering result. */
    for (int i = 0; i < kClusterNum; ++i)
    {
        std::cout << "cluster " << i << ": " << std::endl;
        for (const Point &point : clusters[i])
        {
            std::cout << "(" << point.first << "," << point.second << ")" << ", ";
        }
        std::cout << std::endl;
    }
    return 0;
}