C++ 实现 Kmeans 聚类算法
最近在做一个医学影像处理相关的项目,用到了聚类算法,记录一下
聚类算法Kmeans
算法会将数据集分为 K 个簇,每个簇使用簇内所有样本均值来表示,将该均值称为“质心”:
优点:原理简单,容易实现;聚类速度快;
缺点:易受初始质心的影响;算法可能收敛到局部最优或不收敛;
改进
选择初始质心:初始的聚类中心之间的相互距离要尽可能的远。
1、从样本中选择 1 个点作为初始质心(完全随机)
2、对于任意一个非质心样本 x,计算x与现有最近质心距离 D(x)
3、基于 D(x) 选择下一个质心:D(x) 越大(即离现有质心越远)的点越应被选为新质心。标准 K-means++ 按 D(x)² 的概率随机选取,本文实现取确定性的最远点。
4、重复步骤 2 与 3 ,直到选择 k 个质心为止。
Kmeans.h
#pragma once
#ifndef K_MEANS_H
#define K_MEANS_H
#include<vector>
using namespace std;
// One sample point to be clustered. The dimensionality is fixed at three
// here; higher-dimensional data would need a dynamic array instead.
struct Tuple
{
float x;
float y;
float z;
};
class K_means
{
private:
vector<Tuple> basic_data;//initialization data
int mean_number;//k number
vector<vector<Tuple>> result_data;//{ mean_number };//The resulting data is also used to store intermediate clustering data
vector<Tuple> means;//The cluster center of the intermediate computation store
vector<float> mea_square_error;//Mean square deviation of each cluster
public:
K_means(vector<Tuple> basic_data, int mean_number) : result_data(mean_number), mea_square_error(mean_number)
{
this->basic_data = basic_data;
this->mean_number = mean_number;
}
//Calculate the final clustering result
vector<Tuple> get_fina_result_data();
private:
//Threshold selection
//vector<Tuple> threshold_selection(int x, int y, int z);
//Calculate the distance between two points
float get_distxyz(Tuple tuple1, Tuple tuple2);
//Initialize k cluster centers
void prime_k_meansbase();
//Initialize to get K clusters
void get_k_means();
//Get the current class center of each cluster
void get_basemean();
//Get new cluster
void get_new_k_means();
//You get the variance of each of these clusters
void get_k_mean_erro();
};
#endif
Kmeans.cpp
#include "kmeans.h"

#include <io.h>

#include <algorithm>
#include <cmath>
#include <ctime>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

using namespace std;
#define FLT_MAXX 3.402823466e+38F
// Euclidean distance between two points.
// BUG FIX: the result was stored in an `int`, truncating every fractional
// distance (all sub-unit distances became 0, which broke nearest-centroid
// selection and convergence). Also use x*x + sqrt instead of pow(), which is
// a general transcendental and far slower for small integer exponents.
float K_means::get_distxyz(Tuple tuple1, Tuple tuple2)
{
	const float dx = tuple1.x - tuple2.x;
	const float dy = tuple1.y - tuple2.y;
	const float dz = tuple1.z - tuple2.z;
	return sqrt(dx * dx + dy * dy + dz * dz);
}
//Initialize k center points
void K_means::prime_k_meansbase()
{
struct timeb timeSeed;
ftime(&timeSeed);
srand(timeSeed.time * 1000 + timeSeed.millitm);
//srand((unsigned)time(0));
int max_size = basic_data.size();
int rand1 = rand() % max_size;
cout << "index0: " << rand1 << endl;
means.push_back(basic_data[rand1]);
for (int t = 1; t != mean_number; t++)
{
int c_size = t;
float maxs = 0;
int index = -1;
for (int i = 0; i < basic_data.size(); i++) {
float sum = 0;
for (int j = 0; j < c_size; j++) {
sum += pow(pow((basic_data[i].x - means[j].x), 2) + pow((basic_data[i].y - means[j].y), 2)
+ pow((basic_data[i].z - means[j].z), 2), 0.5);
}
if (sum > maxs) {
maxs = sum;
index = i;
}
}
//cout << "index: " << index << " ";
means.push_back(basic_data[index]);
}
}
//get k clusters
void K_means::get_k_means()
{
for (int i = 0; i != basic_data.size(); ++i)
{
float min_distance = 0x3f3f3f3f;
int static temp;
for (int j = 0; j != mean_number; ++j)
{
if (min_distance > get_distxyz(basic_data[i], means[j]))
{
min_distance = get_distxyz(basic_data[i], means[j]);
temp = j;
//cout << temp << endl;;
}
}
result_data[temp].push_back(basic_data[i]);
}
}
//Compute the center of the cluster
void K_means::get_basemean()
{
vector<Tuple> A_mean;
for (int j = 0; j < mean_number; j++)
{
float sumx = 0;
float sumy = 0;
float sumz = 0;
int num = result_data[j].size();
A_mean = result_data[j];
//cout << num << endl;
for (int i = 0; i < num; i++)
{
sumx += A_mean[i].x;
sumy += A_mean[i].y;
sumz += A_mean[i].z;
}
Tuple basemean = { sumx / num,sumy / num,sumz / num };
means[j] = basemean;
}
}
//Recalculated cluster
void K_means::get_new_k_means()
{
result_data.clear();
result_data = vector<vector<Tuple>>(mean_number);
for (int i = 0; i != basic_data.size(); ++i)
{
float min_distance = 0x3f3f3f3f;
int static temp2;
for (int j = 0; j != mean_number; ++j)
{
if (min_distance > get_distxyz(basic_data[i], means[j]))
{
min_distance = get_distxyz(basic_data[i], means[j]);
temp2 = j;
}
}
result_data[temp2].push_back(basic_data[i]);
}
}
//Calculate the variance
void K_means::get_k_mean_erro()
{
vector<float> square_error;
for (int j = 0; j < mean_number; j++)
{
float sum = 0;
int num = result_data[j].size();
vector<Tuple> A_mean = result_data[j];
Tuple basemean = means[j];
for (int i = 0; i < num; i++)
{
sum += pow(get_distxyz(A_mean[i], basemean), 2);
}
float result = sum / num;
mea_square_error[j] = result;
}
}
//Calculate the final result: Lloyd iterations until the per-cluster error
// vector stops changing. A converged run is accepted only if every cluster's
// MSE is below 100; otherwise the run restarts with fresh centroids, keeping
// the best centroids seen so far. A restart is also forced every 200
// iterations, and after 2000 iterations the best approximation is returned.
// FIXES: (1) LOG4CXX_DEBUG(VBNLogger, ...) referenced a logger that is not
// declared anywhere in this file (compile error) — failure is reported on
// stderr instead; (2) on every restart path `means` was cleared but
// `result_data` was NOT, so get_k_means appended into the stale clusters and
// samples accumulated/duplicated across restarts — result_data is now reset
// before each re-clustering; (3) the `while (i)` int-as-bool flag is now a
// named bool.
vector<Tuple> K_means::get_fina_result_data()
{
	//Initialize
	prime_k_meansbase();
	get_k_means();
	get_k_mean_erro();
	bool running = true;
	int number = 0;          // iteration counter
	int temp = -1;           // best "tight cluster" count over all restarts
	vector<Tuple> appr_mean; // best approximate centroids seen so far
	while (running)
	{
		number++;
		vector<float> last_k_mean_erro = mea_square_error;
		get_basemean();
		get_new_k_means();
		get_k_mean_erro();
		int count_noerror = 0;
		if (number == 2000) {
			// Iteration budget exhausted: give back the best approximation.
			cerr << "K-means Algorithm failure" << endl;
			return appr_mean;
		}
		else if (number % 200 == 0) {
			// Periodic restart with fresh centroids and EMPTY clusters.
			means.clear();
			result_data = vector<vector<Tuple>>(mean_number);
			prime_k_meansbase();
			get_k_means();
			get_k_mean_erro();
			cout << "number:" << number << endl;
		}
		else {
			if (mea_square_error == last_k_mean_erro) { // converged
				running = false;
				for (int j = 0; j != mean_number; ++j)
				{
					if (last_k_mean_erro[j] < 100) {
						count_noerror++;
					}
				}
				if (count_noerror != mean_number) {
					// Not every cluster is tight enough: remember the best
					// centroids so far, then restart from scratch.
					if (temp < count_noerror) {
						appr_mean = means;
					}
					temp = count_noerror;
					means.clear();
					result_data = vector<vector<Tuple>>(mean_number);
					prime_k_meansbase();
					get_k_means();
					get_k_mean_erro();
					running = true;
				}
			}
		}
	}
	return means;
}