K-means算法是很典型的基于距离的聚类算法,采用距离作为相似性的评价指标,即认为两个对象的距离越近,其相似度就越大。该算法认为簇是由距离靠近的对象组成的,因此把得到紧凑且独立的簇作为最终目标。
算法过程如下:
1)从N个样本中随机选取K个样本作为初始质心
2)对剩余的每个样本测量其到每个质心的距离,并把它归到最近的质心的类
3)重新计算已经得到的各个类的质心
4)重复第2~3步,直至新质心与原质心相等,或二者之间的距离小于指定阈值,算法结束
#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* Compile-time limits of the clustering run. */
/* NOTE(review): the macro name is spelled DIMENSIOM (sic); left unchanged because other code may reference it. */
#define DIMENSIOM 2 // only 2-dimensional data is handled for now
#define MAX_ROUND_TIME 100 // maximum number of clustering rounds
/* One data point of the set being clustered. */
typedef struct Item{
int dimension_1; // value of the first dimension
int dimension_2; // value of the second dimension
int clusterID; // which cluster center this item currently belongs to
}Item;
/* The data set; initial() allocates room for data_size + 1 items. */
Item* data;
/* A cluster center: real-valued coordinates in both dimensions plus a cluster id. */
typedef struct ClusterCenter{
double dimension_1;
double dimension_2;
int clusterID;
}ClusterCenter;
/* NOTE(review): presumably the centers recomputed in each round — confirm where this is allocated and used. */
ClusterCenter* cluster_center_new;
/* Program-wide state shared by all clustering steps. */
int isContinue; // NOTE(review): presumably the "keep iterating" flag of the K-means loop — confirm
int* cluster_center; // records the centers
double* distanceFromCenter; // records the distance from one point to every center
int data_size; // number of items in the data set, taken from argv[1]
char filename[200]; // name of the data file, taken from argv[2]
int cluster_count; // number of clusters, taken from argv[3]
/*
 * Forward declarations.
 * main() calls initial(), readDataFromFile(), initial_cluster() and K_means()
 * in that order.
 */
void initial();
void readDataFromFile();
void initial_cluster();
void calculateDistance_ToOneCenter(int itemID, int centerID, int count);
void calculateDistance_ToAllCenter(int itemID);
void partition_forOneItem(int itemID);
void partition_forAllItem_OneCluster(int round);
void calculate_clusterCenter(int round);
void K_means();
void writeClusterDataToFile(int round);
void writeClusterCenterToFile(int round);
void compareNew_OldClusterCenter(double* new_X_Y);
void test_1();
/*
 * Parse `text` as a strictly positive decimal integer.
 *
 * Stores the value in *out and returns 0 on success. Returns -1 when the
 * text is empty, has trailing characters, is out of range, or is not
 * positive. Values equal to INT_MAX are rejected as well, because initial()
 * allocates data_size + 1 items and that addition must not overflow.
 */
static int parse_positive_int(const char *text, int *out)
{
    char *end = NULL;
    long value;

    errno = 0;
    value = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE
            || value <= 0 || value >= INT_MAX) {
        return -1;
    }
    *out = (int)value;
    return 0;
}

/*
 * Program entry point.
 *
 * Expects exactly three command-line arguments:
 *   argv[1]  number of items in the data set   (stored in data_size)
 *   argv[2]  name of the file holding the data (stored in filename)
 *   argv[3]  number of clusters to build       (stored in cluster_count)
 *
 * Once the arguments are validated, the random generator is seeded and
 * initial(), readDataFromFile(), initial_cluster() and K_means() run in
 * that order.
 *
 * Returns 0 on success. Terminates with EXIT_FAILURE when the argument
 * count is wrong, a number is malformed or out of range, the cluster count
 * exceeds the data set size, or the file name does not fit in `filename`.
 */
int main(int argc, char* argv[]){
    int written;

    if (argc != 4) {
        fprintf(stderr, "This application need other parameter to run:"
                "\n\t\tthe first is the size of data set,"
                "\n\t\tthe second is the file name that contain data"
                "\n\t\tthe third indicate the cluster_count"
                "\n");
        exit(EXIT_FAILURE);
    }

    if (parse_positive_int(argv[1], &data_size) != 0) {
        fprintf(stderr, "invalid data set size: %s\n", argv[1]);
        exit(EXIT_FAILURE);
    }

    /* K centers are drawn from the N samples, so K must not exceed N. */
    if (parse_positive_int(argv[3], &cluster_count) != 0
            || cluster_count > data_size) {
        fprintf(stderr, "invalid cluster count: %s (must be between 1 and %d)\n",
                argv[3], data_size);
        exit(EXIT_FAILURE);
    }

    /* Bounded copy: an over-long name must not overflow the fixed buffer. */
    written = snprintf(filename, sizeof filename, "%s", argv[2]);
    if (written < 0 || (size_t)written >= sizeof filename) {
        fprintf(stderr, "file name is too long (limit is %zu characters)\n",
                sizeof filename - 1);
        exit(EXIT_FAILURE);
    }

    srand((unsigned)time(NULL));

    initial();
    readDataFromFile();
    initial_cluster();
    K_means();
    return 0;
}
/*
* 对涉及到的二维动态数组根据main函数中传入的参数分配空间
* */
void initial(){
data = (Item*)malloc(sizeof(struct Item) * (data_size + 1));
if( !data )
{
printf("malloc error:data!");
exit(0);
}
cluster_center = (int*)