这篇文章中的K-means是简单的实现,没有调初始质心点和K的值,k的值设为3,初始质心点是测试数据的前三个。
并且没有对数据进行归一化处理,只是用原始数据进行聚类。
下面是代码:
#include <iostream>
#include<sstream>
#include<fstream>
#include<vector>
#include<math.h>
#include<stdlib.h>
#define k 3
using namespace std;
typedef vector<double> Train;
int dataNum;
int dimNum;
int Min(double d1,double d2,double d3){
if(d1<=d2 && d1<=d3)
return 1;
if(d2<=d1 && d2<=d3)
return 2;
if(d3<=d1 && d3<=d2)
return 3;
}
void getMinDistance(vector<Train>& trains,vector<Train>Cen){
double d1=0;
double d2=0;
double d3=0;
for(int i=0;i<dataNum;i++){
for(int j=1;j<dimNum;j++){
d1 += (trains[i][j]-Cen[0][j])*(trains[i][j]-Cen[0][j]);
d2 += (trains[i][j]-Cen[1][j])*(trains[i][j]-Cen[1][j]);
d3 += (trains[i][j]-Cen[2][j])*(trains[i][j]-Cen[2][j]);
}
switch(Min(d1,d2,d3)){
case 1:
trains[i][0]=1; break;
case 2:
trains[i][0]=2; break;
case 3:
trains[i][0]=3; break;
default :
break;
}
d1=0;d2=0;d3=0;
}
}
vector<Train> getNewCenter(vector<Train> trains){
//重新计算质心点
vector<Train> newCenter;
Train centerdata1(dimNum+1,0);
Train centerdata2(dimNum+1,0);
Train centerdata3(dimNum+1,0);
centerdata1[0]=1;
centerdata2[0]=2;
centerdata3[0]=3;
int i1,i2,i3;
i1=0;i2=0;i3=0;
for(int i=0;i<dataNum;i++){
int Cases = (int)(trains[i][0]);
switch(Cases){
case 1:
i1++;
for(int j=1;j<dimNum;j++){
centerdata1[j] += trains[i][j];
}
break;
case 2:
i2++;
for(int j=1;j<dimNum;j++){
centerdata2[j] += trains[i][j];
}
break;
case 3:
i3++;
for(int j=1;j<dimNum;j++){
centerdata3[j] += trains[i][j];
}
break;
default:
break;
}
}
for(int j=1;j<dimNum;j++){
centerdata1[j]=centerdata1[j]/i1;
centerdata2[j]=centerdata2[j]/i2;
centerdata3[j]=centerdata3[j]/i3;
}
newCenter.push_back(centerdata1);
newCenter.push_back(centerdata2);
newCenter.push_back(centerdata3);
return newCenter;
}
vector<Train> Sort(vector<Train>& trains,vector<Train>Cen){
vector<Train>newCenter;
//数据分类
getMinDistance(trains,Cen);
//重新计算质心点
newCenter = getNewCenter(trains);
return newCenter;
}
void KMeans(vector<Train>& trains){
//确定初始质心点 just use the 1,2,3,data(cannot ensure they are randable)
//the first sort
int isNotChange=1;
//init centers
vector<Train>lastCen;
vector<Train>Centers;
for(int i=0;i<3;i++){
// Centers.push_back(trains[i]);
lastCen.push_back(trains[i]);
}
while(isNotChange){
Centers = Sort(trains,lastCen);
if(Centers != lastCen ){
lastCen = Centers;
}
else{
isNotChange = 0;
}
}
cout<<"聚类结果:"<<endl;
for(int i=0;i<dataNum;i++){
for(int j=0;j<=dimNum;j++){
cout<<trains[i][j]<<" ";
}
cout<<endl;
}
}
int main()
{
char fname[256];
cout<<"请输入存放数据的文件名:"<<endl;
cin>>fname;
cout<<"维数:"<<endl;
cin>>dimNum;
cout<<"样本数目:"<<endl;
cin>>dataNum;
ifstream infile(fname);
if(!infile){
cout<<"cannot open the choosen file!"<<fname<<endl;
return 0;
}
vector<Train> trains;
for(int i=0;i<dataNum&&!infile.eof();i++){
string str;
getline(infile,str); //read data from infile to str; only one line
istringstream istr(str); //read data from str to istr;
Train train(dimNum+1,0);
train[0]=i+1;
for(int j=1;j<=dimNum;j++){
istr>>train[j];
}
trains.push_back(train);
}
cout<<"开始聚类"<<endl;
KMeans(trains);
return 0;
}
测试结果如下图:
第一列即为聚类结果,1表示第一类,2表示第二类,3表示第3类
只是最简单的K-means过程,,
如果有好的调K-means的方法,欢迎提出~