k-means过程:
- 从数据集中随机选取k个向量作为初始中心(center)
- 迭代(直到达到最大迭代次数,或迭代前后的聚类结果不再变化):
  - 计算距离每个向量最近的center,将该向量归入对应的类
  - 计算每一类中所有向量的质心,作为该类新的center
距离度量使用欧氏距离。
测试数据集(68040个向量,每个32维):CorelFeatures-mld/ColorHistogram.asc
#include <iostream>
#include <math.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include <vector>
#include <map>
#include <fstream>
using namespace std;
/**
* @brief Computes the Euclidean distance between two vectors.
*
* Both vectors are taken by const reference so that no copy is made per call;
* this function sits in the innermost loop of the clustering pass.
* The vectors are expected to have the same dimension. If they differ, only
* the components present in both are compared, so no out-of-range element is
* ever read.
*
* @param a n-dimensional vector
* @param b n-dimensional vector
* @return Euclidean distance between a and b (0 when either vector is empty)
*/
double calcMeas(const std::vector<double>& a, const std::vector<double>& b){
    typedef std::vector<double>::size_type size_type;
    // Bound the loop by the shorter vector to avoid reading past the end.
    const size_type dims = a.size() < b.size() ? a.size() : b.size();
    double sum_sq = 0;
    for(size_type i=0; i<dims; i++){
        const double diff = a[i]-b[i];
        sum_sq += diff*diff;
    }
    return sqrt(sum_sq);
}
const int m = 68040;// rows: number of vectors in the data set
const int n = 32;// columns: number of dimensions of each vector
int k = 10;// number of clusters
int n_calc = 100;// maximum number of iterations
/**
* @brief Runs k-means clustering on the vectors stored in ColorHistogram.asc.
*
* Reads m rows, each made of one leading value (skipped) followed by n
* components. Seeds k centers from randomly chosen rows, then alternates
* between assigning every vector to its nearest center (distance computed by
* calcMeas) and recomputing each center as the mean of its members. Iteration
* stops after n_calc passes, or earlier once no vector changes cluster.
*
* @return 0 on success, 1 if k is not positive or the input file cannot be
*         opened or fully parsed.
*/
int main(){
    if(k < 1){
        std::cerr<<"k must be at least 1"<<std::endl;
        return 1;
    }

    // Open read-only: a plain fstream requests read+write access and
    // therefore fails on files the process may only read.
    std::ifstream fs("ColorHistogram.asc");
    if(!fs){
        std::cerr<<"cannot open ColorHistogram.asc"<<std::endl;
        return 1;
    }
    std::vector<std::vector<double> > data;
    data.reserve(m);
    for(int i=0; i<m; i++){
        std::vector<double> line;
        line.reserve(n);
        for(int j=0; j<=n; j++){
            double t;
            if(!(fs>>t)){
                std::cerr<<"failed to read value "<<j<<" of row "<<i<<std::endl;
                return 1;
            }
            // The first value of every row is skipped; the next n are the components.
            if(j != 0){
                line.push_back(t);
            }
        }
        data.push_back(line);
    }
    fs.close();
    std::cout<<"read data over."<<std::endl;

    // Pick k random vectors as the initial centers.
    // rand() is never seeded, so the same rows are chosen on every run.
    std::vector<std::vector<double> > center(k);
    for(int i=0; i<k; i++){
        center[i] = data[rand()%m];
    }
    std::cout<<"rand k-center over."<<std::endl;

    // Cluster index of every vector; -1 means "not assigned yet", which
    // guarantees the first pass is counted as a change.
    std::vector<int> belong(m, -1);
    std::vector<int> num_center(k, 0);// number of vectors in each cluster
    std::vector<std::vector<double> > sum_center(k);// per-cluster component sums

    for(int nc=0; nc<n_calc; nc++){
        std::cout<<"---------------------------------------"<<std::endl;
        std::cout<<"calc: "<<nc<<std::endl;

        // Assign every vector to its nearest center (Euclidean distance).
        bool changed = false;
        for(int i=0; i<m; i++){
            int best = 0;
            double length = calcMeas(data[i], center[0]);
            for(int j=1; j<k; j++){
                const double meas = calcMeas(data[i], center[j]);
                if(meas < length){
                    length = meas;
                    best = j;
                }
            }
            if(belong[i] != best){
                belong[i] = best;
                changed = true;
            }
        }
        std::cout<<"classify over."<<std::endl;

        // Recompute the centroid of every cluster.
        for(int i=0; i<k; i++){
            sum_center[i].assign(n, 0.0);
        }
        num_center.assign(k, 0);
        for(int i=0; i<m; i++){
            std::vector<double>& acc = sum_center[belong[i]];
            num_center[belong[i]]++;
            for(int j=0; j<n; j++){
                acc[j] += data[i][j];
            }
        }
        for(int i=0; i<k; i++){
            // An empty cluster keeps its previous center; dividing by a zero
            // count would turn the center into NaN for the rest of the run.
            if(num_center[i] == 0){
                continue;
            }
            for(int j=0; j<n; j++){
                center[i][j] = sum_center[i][j]/num_center[i];
            }
        }
        std::cout<<"recalc center over."<<std::endl;

        // Print the size of every cluster.
        std::cout<<"num of center:"<<std::endl;
        for(int i=0; i<k; i++){
            std::cout<<i<<"\t"<<num_center[i]<<std::endl;
        }

        // Converged once no vector moved to another cluster. Comparing only
        // cluster sizes is not enough, because vectors can swap clusters
        // while the sizes stay the same.
        if(!changed) break;
    }
    return 0;
}