c语言生成分组聚类图像,【转】kmeans算法c语言实现，能对不同维度的数据进行聚类...

最新推荐文章于 2023-05-15 16:59:33 发布

ffts0721

最新推荐文章于 2023-05-15 16:59:33 发布

阅读量387

点赞数

文章标签： c语言生成分组聚类图像

最近在苦于思考kmeans算法的MPI并行化，花了两天的时间先把串行版本实现了。

聚类问题就是给定一个元素集合V，其中每个元素具有d个可观察属性，使用某种算法将V划分成k个子集，要求每个子集内部的元素之间相异度尽可能低，而不同子集的元素相异度尽可能高。

下面是google到该算法的一个流程图，表意清楚：

1、随机选取数据集中的k个数据点作为初始的聚类中心：

2、计算每个数据点对应的最短距离的聚类中心：：

3、利用目前得到的聚类重新计算中心点：

4、重复步骤2和3直到收敛(达到最大迭代次数或聚类中心移动距离极小)：

code:

1 #include

3 #include

4 #include

5 #include

7 int K,N,D; //聚类的数目，数据量，数据的维数

8 float **data; //存放数据

9 int *in_cluster; //标记每个点属于哪个聚类

10 float **cluster_center; //存放每个聚类的中心点

12 float **array(int m,int n);

13 void freearray(float **p);

14 float **loadData(int *k,int *d,int *n);

15 float getDistance(float avector[],float bvector[],int n);

16 void cluster();

17 float getDifference();

18 void getCenter(int in_cluster[N]);

20 int main()

21 {

22 int i,j,count=0;

23 float temp1,temp2;

24 data=loadData(&K,&D,&N);

25 printf("Data sets:\n");

26 for(i=0;i

27 for(j=0;j

28 printf("%-8.2f",data[i][j]);

29 if((j+1)%D==0) putchar('\n');

30 }

31 printf("-----------------------------\n");

33 srand((unsigned int)(time(NULL))); //随机初始化k个中心点

34 for(i=0;i

35 for(j=0;j

36 cluster_center[i][j]=data[(int)(N*rand()/(RAND_MAX+1.0))][j];

38 cluster(); //用随机k个中心点进行聚类

39 temp1=getDifference(); //第一次中心点和所属数据点的距离之和

40 count++;

41 printf("The difference between data and center is: %.2f\n\n", temp1);

43 getCenter(in_cluster);

44 cluster(); //用新的k个中心点进行第二次聚类

45 temp2=getDifference();

46 count++;

47 printf("The difference between data and center is: %.2f\n\n",temp2);

49 while(fabs(temp2-temp1)!=0){ //比较前后两次迭代，若不相等继续迭代

50 temp1=temp2;

51 getCenter(in_cluster);

52 cluster();

53 temp2=getDifference();

54 count++;

55 printf("The %dth difference between data and center is: %.2f\n\n",count,temp2);

56 }

58 printf("The total number of cluster is: %d\n\n",count); //统计迭代次数

59 system("pause");

60 return 0;

61 }

64 //动态创建二维数组

65 float **array(int m,int n)

66 {

67 float **p;

68 p=(float **)malloc(m*sizeof(float *));

69 p[0]=(float *)malloc(m*n*sizeof(float));

70 for(int i=1;i

71 return p;

72 }

74 //释放二维数组所占用的内存

75 void freearray(float **p)

76 {

77 free(*p);

78 free(p);

79 }

81 //从data.txt导入数据，要求首行格式：K=聚类数目,D=数据维度,N=数据量

82 float **loadData(int *k,int *d,int *n)

83 {

84 float **arraydata;

85 FILE *fp;

86 if((fp=fopen("data.txt","r"))==NULL) fprintf(stderr,"cannot open data.txt!\n");

87 if(fscanf(fp,"K=%d,D=%d,N=%d\n",k,d,n)!=3) fprintf(stderr,"load error!\n");

88 arraydata=array(*n,D); //生成数据数组

89 cluster_center=array(*k,D); //聚类的中心点

90 in_cluster=(int *)malloc(*n * sizeof(int)); //每个数据点所属聚类的标志数组

91 for(int i=0;i

92 for(int j=0;j

93 fscanf(fp,"%f",&arraydata[i][j]); //读取数据点

94 return arraydata;

95 }

97 //计算欧几里得距离

98 float getDistance(float avector[],float bvector[],int n)

99 {

100 int i;

101 float sum=0.0;

102 for(i=0;i

103 sum+=pow(avector[i]-bvector[i],2);

104 return sqrt(sum);

105 }

106

107 //把N个数据点聚类，标出每个点属于哪个聚类

108 void cluster()

109 {

110 int i,j;

111 float min;

112 float **distance=array(N,K); //存放每个数据点到每个中心点的距离

113 //float distance[N][K]; //也可使用C99变长数组

114 for(i=0;i

115 min=9999.0;

116 for(j=0;j

117 distance[i][j] = getDistance(data[i],cluster_center[j],D);

118 //printf("%f\n", distance[i][j]);

119 if(distance[i][j]

120 min=distance[i][j];

121 in_cluster[i]=j;

122 }

123 }

124 printf("data[%d] in cluster-%d\n",i,in_cluster[i]+1);

125 }

126 printf("-----------------------------\n");

127 free(distance);

128 }

129

130 //计算所有聚类的中心点与其数据点的距离之和

131 float getDifference()

132 {

133 int i,j;

134 float sum=0.0;

135 for(i=0;i

136 for(j=0;j

137 if(i==in_cluster[j])

138 sum+=getDistance(data[j],cluster_center[i],D);

139 }

140 }

141 return sum;

142 }

143

144 //计算每个聚类的中心点

145 void getCenter(int in_cluster[N])

146 {

147 float **sum=array(K,D); //存放每个聚类中心点

148 //float sum[K][D]; //也可使用C99变长数组

149 int i,j,q,count;

150 for(i=0;i

151 for(j=0;j

152 sum[i][j]=0.0;

153 for(i=0;i

154 count=0; //统计属于某个聚类内的所有数据点

155 for(j=0;j

156 if(i==in_cluster[j]){

157 for(q=0;q

158 sum[i][q]+=data[j][q]; //计算所属聚类的所有数据点的相应维数之和

159 count++;

160 }

161 }

162 for(q=0;q

163 cluster_center[i][q]=sum[i][q]/count;

164 }

165 printf("The new center of cluster is:\n");

166 for(i = 0; i < K; i++)

167 for(q=0;q

168 printf("%-8.2f",cluster_center[i][q]);

169 if((q+1)%D==0) putchar('\n');

170 }

171 free(sum);

172 }

该程序支持不同维度的数据集，一个示例的数据集: data.txt如下：

K=3,D=3,N=15

-25 22.2 35.34

31.2 -14.4 23

32.02 -23 24.44

-25.35 36.3 -33.34

-20.2 27.333 -28.22

-15.66 17.33 -23.33

26.3 -31.34 16.3

-22.544 16.2 -32.22

12.2 -15.22 22.11

-41.241 25.232 -35.338

-22.22 45.22 23.55

-34.22 50.14 30.98

15.23 -30.11 20.987

-32.5 15.3 -25.22

-38.97 20.11 33.22

ffts0721

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
c语言生成分组聚类图像,【转】kmeans算法c语言实现，能对不同维度的数据进行聚类...

最近在苦于思考kmeans算法的MPI并行化，花了两天的时间先把串行版本实现了。聚类问题就是给定一个元素集合V，其中每个元素具有d个可观察属性，使用某种算法将V划分成k个子集，要求每个子集内部的元素之间相异度尽可能低，而不同子集的元素相异度尽可能高。下面是google到该算法的一个流程图，表意清楚：1、随机选取数据集中的k个数据点作为初始的聚类中心：2、计算每个数据点对应的最短距离的聚类中心：：3...
复制链接

扫一扫