以下记录本人使用VS2008对重庆某运营商WAP网关采集到的日志文件统计后的信息进行K-means算法
#include "stdafx.h"
#include<stdio.h>#include<iostream>
#include<string>
#include<math.h>
#include<fstream>
#include<vector>
using namespace std;
struct Person //定义用户结构体,包括电话号码,在线时长,一周会话数,点击量,上行流量,下行流量及聚类的号
{
string calling_num;
int duration;
int sessNum;
int clicks;
int uplink;
int downlink;
int updownlink;
int classid;
};
struct Clusterr //每个类的中心点信息
{
int duration;
int sessNum;
int clicks;
int uplink;
int downlink;
//float updownlink;
int classid;
};
void fileWrite(char *file,vector<Person> &persons)
{
string c;
Person person;
ifstream sourcefile(file,ios::in);
while(getline(sourcefile,c))
{
int temp[7]; int j=0;
for(size_t i=0;i<c.size();i++)
{
if(c[i]==' ')
{
temp[j++]=i;
}
}
person.calling_num=c.substr(0,temp[0]-1);
person.duration=atoi(c.substr(temp[0]+1,temp[1]-temp[0]-1).c_str());
person.sessNum=atoi(c.substr(temp[1]+1,temp[2]-temp[1]-1).c_str());
person.clicks=atoi(c.substr(temp[2]+1,temp[3]-temp[2]-1).c_str());
person.uplink=atof(c.substr(temp[3]+1,temp[4]-temp[3]-1).c_str())/1000;
person.downlink=atof(c.substr(temp[4]+1,temp[5]-temp[4]-1).c_str())/1000;
person.classid=0;
persons.push_back(person);
}
}
void initCenter(vector<Person> &persons,vector<Clusterr> &clusterCenter,int k)
{
Clusterr temp;
for(size_t i=0;i<k;i++)
{
temp.classid=i+1; //initialize k clusters' center
temp.clicks=persons[i].clicks;
temp.downlink=persons[i].downlink;
temp.duration=persons[i].duration;
temp.sessNum=persons[i].sessNum;
temp.uplink=persons[i].uplink;
//temp.updownlink=float(persons[i].updownlink);
clusterCenter.push_back(temp);
}
}
float dist(Person personer,Clusterr center) //使用的是欧几里德距离进行计算
{
float dis;
float cc=abs(personer.clicks-center.clicks)+abs(personer.duration-center.duration)+abs(personer.uplink-center.uplink)+abs(personer.downlink-center.downlink)+abs(personer.sessNum-center.sessNum);
dis = sqrt(cc);
return dis;
}
int diss(Clusterr a,Clusterr b)
{
int cc; float jj;
jj=abs(a.clicks-b.clicks)+abs(a.duration-b.duration)+abs(a.sessNum-b.sessNum)+abs(a.uplink-b.uplink);
cc=(int)sqrt(jj);
return cc;
}
void k_means(vector<Person> &persons,vector<Clusterr> &clusterCenter,int temp)
{
float distance=dist(persons[temp],clusterCenter[0]);
float dis;
persons[temp].classid=clusterCenter[0].classid;
for(size_t t=1;t<clusterCenter.size();t++)
{
dis=dist(persons[temp],clusterCenter[t]);
if(dis<distance)
{
persons[temp].classid=clusterCenter[t].classid;
distance=dis;
}
}
}
int findSame(vector<Clusterr> past,vector<Clusterr> nowt)
{
int findtemp=1;
int du=2;
if (diss(past[0],nowt[0])<du&&diss(past[1],nowt[1])&&diss(past[2],nowt[2])<du&&diss(past[3],nowt[3])<du)
findtemp=0;
return findtemp;
}
int centerUpdate(vector<Person> persons,vector<Clusterr> &clusterCenter,int k)
{
int *classNum;
int sett;
vector<Clusterr> pre;
//Cluster past;
classNum=new int[k+1];
for(int ii=0;ii<k+1;ii++)
classNum[ii]=1;
for(size_t kk=0;kk<clusterCenter.size();kk++) //用于保存各类的原中心位置信息
pre.push_back(clusterCenter[kk]);
for(size_t i=0;i<persons.size();i++) //更新了类的中心位置信息
for(int j=1;j<k+1;j++)
if(persons[i].classid == j)
{
classNum[j]+=1;
/*
*clusterCenter[j-1].clicks=(clusterCenter[j-1].clicks+persons[i].clicks/(classNum[j]-1))/(1+1/(classNum[j]-1));
*clusterCenter[j-1].duration=(clusterCenter[j-1].duration+persons[i].duration/(classNum[j]-1))/(1+1/(classNum[j]-1));
*clusterCenter[j-1].uplink=(clusterCenter[j-1].uplink+persons[i].uplink/(classNum[j]-1))/(1+1/(classNum[j]-1));
*clusterCenter[j-1].sessNum=(clusterCenter[j-1].sessNum+persons[i].sessNum/(classNum[j]-1))/(1+1/(classNum[j]-1));
*clusterCenter[j-1].downlink=(clusterCenter[j-1].downlink+persons[i].downlink/(classNum[j]-1))/(1+1/(classNum[j]-1));
*clusterCenter[j-1].updownlink=(clusterCenter[j-1].updownlink+persons[i].updownlink/(classNum[j]-1))/(1+1/(classNum[j]-1));
*/
clusterCenter[j-1].clicks+=persons[i].clicks;
clusterCenter[j-1].duration+=persons[i].duration;
clusterCenter[j-1].downlink+=persons[i].downlink;
clusterCenter[j-1].uplink+=persons[i].uplink;
clusterCenter[j-1].sessNum+=persons[i].sessNum;
}
for(int j=0;j<k;j++)
{
clusterCenter[j].clicks=(int)(clusterCenter[j].clicks/classNum[j+1]);
clusterCenter[j].downlink/=classNum[j+1];
clusterCenter[j].uplink/=classNum[j+1]; clusterCenter[j].duration/=classNum[j+1];
clusterCenter[j].sessNum/=classNum[j+1];
}
sett=findSame(pre,clusterCenter);
pre.clear();
return sett;
}
int main()
{
unsigned int k;
char file[100];
string c; int js=1;
int stepnum; int cstep=0;
vector<Person> persons;vector<Clusterr> clusterCenter;
string fileName="data30m.txt"; //可以改动保存用户信息的文件名
fstream ioFile1,ioFile2,ioFile3; //output file stream
char *newfile1="F:\\Visual C++\\Cpp project\\example for C++\\k_means\\user_center.txt";
char *newfile2="F:\\Visual C++\\Cpp project\\example for C++\\k_means\\user_information.txt";
char *newfile3="F:\\Visual C++\\Cpp project\\example for C++\\k_means\\center_info.txt";
sprintf(file,"F:\\Visual C++\\Cpp project\\example for C++\\k_means\\%s",fileName.c_str()); //找到要处理的文件
cout<<"请输入你想聚成几类(按Enter结束):";
cin>>k; cout<<endl;
cout<<"请输入你允许聚类最多迭代的次数(按Enter结束)"<<endl;
cin>>stepnum; cout<<endl;
fileWrite(file,persons); //STL initialize,however it may out of memory
int times=1;
initCenter(persons,clusterCenter,k);
while((cstep<stepnum)&&js)
{
for(size_t i=0;i<persons.size();i++)
{
k_means(persons,clusterCenter,i);
}
js=centerUpdate(persons,clusterCenter,k); //update the center of every class by steps
cstep++;
cout<<"第"<<times++<<"次迭代"<<endl;
}
ioFile1.open(newfile1,ios::out|ios::app);
ioFile3.open(newfile2,ios::out|ios::app);
for(size_t i=0;i<persons.size();i++)
ioFile1<<persons[i].calling_num<<" "<<persons[i].classid<<endl;
for(size_t i=0;i<clusterCenter.size();i++)
ioFile3<<clusterCenter[i].classid<<" "<<clusterCenter[i].duration<<" "<<clusterCenter[i].clicks<<" "<<clusterCenter[i].sessNum<<" "<<clusterCenter[i].uplink<<" "<<clusterCenter[i].downlink<<endl;
}