k-means算法及代码实现

最新推荐文章于 2021-07-28 18:47:39 发布

weixin_34043301

最新推荐文章于 2021-07-28 18:47:39 发布

阅读量161

点赞数

文章标签：人工智能 matlab php

原文链接：http://www.cnblogs.com/C-Hui/archive/2012/03/08/Huihui.html

版权

K-MEANS算法:
k-means 算法接受输入量 k ；然后将n个数据对象划分为 k个聚类以便使得所获得的聚类满足：同一聚类中的对象相似度较高；而不同聚类中的对象相似度较小。聚类相似度是利用各聚类中对象的均值所获得一个“中心对象”（引力中心）来进行计算的。
k-means 算法的工作过程说明如下：
  初始化：聚类数k，初始聚类中心x,迭代次数或者收敛条件。
  首先,从n个数据对象任意选择 k 个对象作为初始聚类中心；而对于所剩下其它对象，则根据它们与这些聚类中心的相似度（距离），分别将它们分配给与其最相似的（聚类中心所代表的）聚类；
  然后,再计算每个所获新聚类的聚类中心（该聚类中所有对象的均值）；
  最后，不断重复上面的“分配—更新”过程，直到满足收敛条件或达到最大迭代次数为止。
目标：各聚类本身尽可能的紧凑，而各聚类之间尽可能的分开.

各种算法实现：
function [cid,nr,centers] = cskmeans(x,k,nc)
% CSKMEANS K-Means clustering - general method.
%
%   [CID,NR,CENTERS] = CSKMEANS(X,K,NC) clusters the rows of X into K
%   groups in two stages:
%     1. HMEANS-style iterations (assign every point to its nearest
%        center, then recompute the centers) until the assignment stops
%        changing or the iteration limit is reached.
%     2. One pass over the observations in which a point is moved to
%        another cluster when the size-adjusted squared distance
%        nr/(nr+1)*dist to that cluster is the smallest one.
%
%   INPUTS:  X  - n x d data matrix, one observation per row.
%            K  - number of desired clusters.
%            NC - optional k x d matrix of initial cluster centers. If NC
%                 is not given, K observations are drawn at random (with
%                 replacement) and perturbed with standard normal noise.
%
%   OUTPUTS: CID     - 1 x n vector of cluster indexes (1..K).
%            NR      - 1 x K vector, number of observations per cluster.
%            CENTERS - K x d matrix, one row per cluster center.
%
%   A cluster that ends up with no observations keeps its previous
%   center instead of turning into NaN.
%
%   See also CSHMEANS
%   W. L. and A. R. Martinez, 9/15/01
%   Computational Statistics Toolbox

[n,d] = size(x);
if nargin < 3
    % Pick some observations to be the cluster centers and add noise so
    % that duplicated picks do not yield identical centers.
    ind = ceil(n*rand(1,k));
    nc = x(ind,:) + randn(k,d);
end

% Storage: cluster membership (1..k) for each observation.
cid = zeros(1,n);
% Deliberately different from cid so the loop below runs at least once.
oldcid = ones(1,n);
% Number of observations in each cluster.
nr = zeros(1,k);

% ---- Stage 1: HMEANS iterations -------------------------------------
maxiter = 100;
iter = 1;
while ~isequal(cid,oldcid) && iter < maxiter
    % Remember the previous assignment; without this the convergence
    % test can never succeed and the loop always runs to maxiter.
    oldcid = cid;
    % Assign every point to the nearest center (squared Euclidean).
    for i = 1:n
        dist = sum((repmat(x(i,:),k,1)-nc).^2,2);
        [m,ind] = min(dist);
        cid(i) = ind;
    end
    % Recompute the centers and the cluster sizes.
    for i = 1:k
        ind = find(cid==i);
        nr(i) = length(ind);
        if nr(i) > 0
            % Average along dimension 1 explicitly: for a cluster with a
            % single member, mean() without a dimension would average
            % across the columns of that one row.
            nc(i,:) = mean(x(ind,:),1);
        end
    end
    iter = iter + 1;
end

% ---- Stage 2: examine each observation individually ------------------
% With maxiter = 2 and iter starting at 1 this performs exactly one pass.
maxiter = 2;
iter = 1;
move = 1;
while iter < maxiter && move ~= 0
    move = 0;
    for i = 1:n
        % Squared distance from this point to all cluster centers.
        dist = sum((repmat(x(i,:),k,1)-nc).^2,2);
        r = cid(i);                 % current cluster of x(i,:)
        dadj = nr./(nr+1).*dist';   % size-adjusted distances
        [m,ind] = min(dadj);
        % Move only when another cluster is strictly better, so ties do
        % not shuffle points around without any gain.
        if ind ~= r && m < dadj(r)
            cid(i) = ind;
            % Keep the sizes and BOTH affected centers in sync.
            nr(r) = nr(r) - 1;
            nr(ind) = nr(ind) + 1;
            nc(ind,:) = mean(x(cid==ind,:),1);
            if nr(r) > 0
                nc(r,:) = mean(x(cid==r,:),1);
            end
            move = 1;
        end
    end
    iter = iter+1;
end

centers = nc;
if move == 0
    disp('No points were moved after the initial clustering procedure.')
else
    disp('Some points were moved after the initial clustering procedure.')
end
----------------------------------------------------------------------------
matlab工具箱使用
首先我们装入数据集：kmeansdata
>> load kmeansdata;
>> size(X)
ans =
              560     4
  数据形式为：

然后调用 kmeans()，选择 k=4，距离度量采用 city block（城市街区距离，又称曼哈顿距离：各维坐标差的绝对值之和，相当于只能沿上下左右方向走、不能走对角线）；默认情况下使用的是（平方）欧氏距离。初始聚类中心随机选择。
>> idx4=kmeans(X,4,'distance','city');
返回的是560*1的列向量，表明各行数据所属的类别；
可以通过调用 silhouette（）这个函数来观察结果。
>> [silh4,h]= silhouette(X,idx4,'city');
>>xlabel('Silhouette Value')
>>ylabel('Cluster')
具体参见matlab说明
-----------------------------------------------------------------------------
C 语言实现
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define TRUE 1
#define FALSE 0
int N;// number of data points
int K;// number of clusters
int * CenterIndex;// indexes into AllData used to pick the initial centroids
double * Center;// current centroids (one value per cluster)
double * CenterCopy;// copy of the centroids, compared against Center to detect convergence
double * AllData;// the data set
double ** Cluster;// the clusters: Cluster[i] holds the values assigned to cluster i
int * Top;// number of elements in each cluster; also used as a stack top when appending
// Fill center[0..k-1] with k distinct random integers x, 0 <= x <= n-1,
// to be used as the indexes of the initial centroids.
//
// n      - size of the range to draw from
// k      - how many distinct values to produce
// center - output array with room for at least k ints
//
// The generator is re-seeded from the current time on every call.
// If n <= 0 nothing is written. If k > n only n distinct values exist,
// so k is clamped to n and only the first n entries are filled.
void CreateRandomArray(int n, int k,int * center)
{
    int filled = 0; // number of distinct values stored so far
    int j;

    if (n <= 0)
    {
        return; // rand() % n would be undefined
    }
    if (k > n)
    {
        k = n; // otherwise the rejection loop below could never finish
    }

    srand( (unsigned)time( NULL ) );
    while (filled < k)
    {
        int candidate = rand() % n;
        // Look for the candidate among the values already chosen.
        for (j = 0; j < filled; j++)
        {
            if (center[j] == candidate)
            {
                break;
            }
        }
        if (j == filled) // not a duplicate: keep it; otherwise draw again
        {
            center[filled++] = candidate;
        }
    }
}
// Return the index of the centroid closest to value.
//
// value  - the data point
// center - array of K centroids (K is the global cluster count)
//
// Distance is the absolute difference. On a tie the lowest index wins,
// because a later centroid must be strictly closer to replace it.
int GetIndex(double value,double * center)
{
    int i;
    int index=0;                       // index of the closest centroid so far
    double min=fabs(value-center[0]);  // distance to that centroid
    for(i=1;i<K;i++)
    {
        double d=fabs(value-center[i]);
        if(d<min)
        {
            index=i;
            min=d;
        }
    }
    return index;
}
// Snapshot the centroids: copy the K entries of Center into CenterCopy.
void CopyCenter()
{
    int idx = K;
    // Element order does not matter for a plain copy, so walk backwards.
    while (idx-- > 0)
    {
        CenterCopy[idx] = Center[idx];
    }
}
// Initialise the centroids from randomly chosen data points.
void InitCenter()
{
    int slot;

    // Draw K distinct indexes below N into CenterIndex.
    CreateRandomArray(N,K,CenterIndex);

    slot = 0;
    while (slot < K)
    {
        int picked = CenterIndex[slot];
        Center[slot] = AllData[picked]; // the chosen data value becomes a centroid
        slot++;
    }

    // Keep a copy of the starting centroids in CenterCopy.
    CopyCenter();
}
// Append value to cluster number index (a push: Top[index] is the stack top).
void AddToCluster(int index,double value)
{
    int slot = Top[index];       // next free position in this cluster
    Cluster[index][slot] = value;
    Top[index] = slot + 1;
}
// Rebuild all clusters: empty them, then put every data point into the
// cluster of its nearest centroid.
void UpdateCluster()
{
    int c;
    int n;

    // Emptying a cluster only requires resetting its element count.
    for (c = 0; c < K; ++c)
    {
        Top[c] = 0;
    }

    for (n = 0; n < N; ++n)
    {
        double sample = AllData[n];
        AddToCluster(GetIndex(sample, Center), sample);
    }
}
// Recompute every centroid as the mean of the values in its cluster.
// A cluster with no elements keeps its current centroid.
void UpdateCenter()
{
    int c;
    for (c = 0; c < K; c++)
    {
        int count = Top[c];
        double total = 0;
        int m;

        if (count <= 0)
        {
            continue; // empty cluster: leave Center[c] untouched
        }
        for (m = 0; m < count; m++)
        {
            total += Cluster[c][m];
        }
        Center[c] = total / count;
    }
}
// Return TRUE when the first K elements of the two arrays are equal,
// FALSE otherwise.
//
// The comparison is exact on purpose: UpdateCenter recomputes each
// centroid by summing the cluster members in the same order, so an
// unchanged partition reproduces bit-identical centroids.
// (The previous code wrapped the comparison result itself in fabs(),
// which only worked by accident.)
int IsEqual(double * center1 ,double * center2)
{
    int i;
    for(i=0;i<K;i++)
    {
        if(center1[i]!=center2[i])
        {
            return FALSE;
        }
    }
    return TRUE;
}
// Print the clustering result: a separator, then for each cluster its
// number, its centroid and all of its member values.
void Print()
{
    int g;
    printf("-------------------------------------- ");
    for (g = 0; g < K; ++g)
    {
        int m = 0;
        printf("第%d 组: 质心(%f)",g,Center[g]);
        while (m < Top[g])
        {
            printf("%f ",Cluster[g][m]);
            m++;
        }
    }
}
// Read the problem from standard input and set up all global state.
//
// Reads the number of data points N and the number of clusters K, then
// allocates Center, CenterIndex, CenterCopy, Top, AllData and Cluster
// (each cluster gets room for N values), reads the N data values, picks
// the initial centroids and builds the initial clusters.
//
// Terminates the program with EXIT_FAILURE and a message on stderr when
// the input cannot be parsed, when N <= 0, when K is not in 1..N, or
// when memory cannot be allocated.
void InitData()
{
    int i=0;
    printf("输入数据个数: ");
    if(scanf("%d",&N)!=1 || N<=0)
    {
        fprintf(stderr,"数据个数必须是正整数\n");
        exit(EXIT_FAILURE);
    }
    printf("输入簇个数: ");
    if(scanf("%d",&K)!=1 || K<=0 || K>N)
    {
        fprintf(stderr,"簇个数必须在 1 和数据个数之间\n");
        exit(EXIT_FAILURE);
    }
    Center=(double *)malloc(sizeof(double)*K);      // centroids
    CenterIndex=(int *)malloc(sizeof(int)*K);       // indexes of the initial centroids
    CenterCopy=(double *)malloc(sizeof(double)*K);  // copy of the centroids
    Top=(int *)malloc(sizeof(int)*K);               // element count per cluster
    AllData=(double *)malloc(sizeof(double)*N);     // the data set
    Cluster=(double **)malloc(sizeof(double *)*K);  // the clusters
    if(Center==NULL || CenterIndex==NULL || CenterCopy==NULL ||
       Top==NULL || AllData==NULL || Cluster==NULL)
    {
        fprintf(stderr,"内存分配失败\n");
        exit(EXIT_FAILURE);
    }
    // Initialise the K clusters; any cluster may end up holding all N values.
    for(i=0;i<K;i++)
    {
        Cluster[i]=(double *)malloc(sizeof(double)*N);
        if(Cluster[i]==NULL)
        {
            fprintf(stderr,"内存分配失败\n");
            exit(EXIT_FAILURE);
        }
        Top[i]=0;
    }
    printf("输入%d 数据: ",N);
    for(i=0;i<N;i++)
    {
        // Read straight into the double array; integer input is still accepted.
        if(scanf("%lf",&AllData[i])!=1)
        {
            fprintf(stderr,"读取第 %d 个数据失败\n",i+1);
            exit(EXIT_FAILURE);
        }
    }
    InitCenter();     // choose the initial centroids
    UpdateCluster();  // build the initial clusters
}
/*
算法描述:
K 均值算法:
给定类的个数 K,将 N 个对象分到 K 个类中去,
使得类内对象之间的相似性最大,而类之间的相似性最小。
*/
main()
{
int Flag=1;//迭代标志,若为 false,则迭代结束
int i=0;
InitData();//初始化数据
while(Flag)//开始迭代
{
UpdateCluster();//更新各个聚类
UpdateCenter();//更新质心数组
if(IsEqual(Center,CenterCopy))//如果本次迭代与前次的质心聚合相等,即已收敛,结束退出
{
Flag=0;
}
else//否则将质心副本置为本次迭代得到的的质心集合
{
CopyCenter();//将质心副本置为本次迭代得到的的质心集合
}
}
Print();//输出结果
getchar();
getchar();
}