K-MEANS算法:
k-means 算法接受输入参数 k,然后将 n 个数据对象划分为 k 个聚类,使得所获得的聚类满足:同一聚类中的对象相似度较高,而不同聚类中的对象相似度较低。聚类相似度是利用各聚类中对象的均值所得到的“中心对象”(引力中心)来进行计算的。
k-means 算法的工作过程说明如下:
初始化:聚类数k,初始聚类中心x,迭代次数或者收敛条件。
首先,从n个数据对象任意选择 k 个对象作为初始聚类中心;而对于所剩下其它对象,则根据它们与这些聚类中心的相似度(距离),分别将它们分配给与其最相似的(聚类中心所代表的)聚类;
然后,再计算每个所获新聚类的聚类中心(该聚类中所有对象的均值);
再次,不断重复上面的过程,直到满足收敛条件(例如聚类中心不再变化)或者达到最大迭代次数为止。
目标:各聚类本身尽可能的紧凑,而各聚类之间尽可能的分开.
各种算法实现:
function [cid,nr,centers] = cskmeans(x,k,nc)
% CSKMEANS K-Means clustering - general method.
%
% This implements the more general k-means algorithm, where
% HMEANS is used to find the initial partition and then each
% observation is examined for further improvements in minimizing
% the within-group sum of squares.
%
% [CID,NR,CENTERS] = CSKMEANS(X,K,NC) Performs K-means
% clustering using the data given in X.
%
% INPUTS: X is the n x d matrix of data,
% where each row indicates an observation. K indicates
% the number of desired clusters. NC is a k x d matrix for the
% initial cluster centers. If NC is not specified, then the
% centers will be randomly chosen from the observations
% (with unit-variance Gaussian noise added).
%
% OUTPUTS: CID provides a set of n indexes (1 x n, values 1..k)
% indicating cluster membership for each point. NR (1 x k) is the
% number of observations in each cluster. CENTERS is a k x d matrix,
% where each row corresponds to a cluster center.
%
% NOTES:
%   * A cluster that receives no observations keeps its previous
%     center instead of turning into a row of NaNs.
%   * NR and CENTERS are kept consistent with CID when points are
%     moved during the second (exchange) pass.
%   * The caller's warning state is restored on normal exit.
%
% See also CSHMEANS
% W. L. and A. R. Martinez, 9/15/01
% Computational Statistics Toolbox

% Silence warnings while running, but remember the caller's settings so
% they can be restored afterwards (a bare "warning on" would override them).
ws = warning('off','all');

[n,d] = size(x);
if nargin < 3
    % Then pick some observations to be the cluster centers.
    ind = ceil(n*rand(1,k));
    % We will add some noise to make it interesting.
    nc = x(ind,:) + randn(k,d);
end

% set up storage
% integer 1,...,k indicating cluster membership
cid = zeros(1,n);
% Make this different to get the loop started.
oldcid = ones(1,n);
% The number in each cluster.
nr = zeros(1,k);
% Set up maximum number of iterations.
maxiter = 100;
iter = 1;

while ~isequal(cid,oldcid) && iter < maxiter
    % Remember the previous assignment; without this the convergence
    % test would always compare against the initial dummy vector.
    oldcid = cid;
    % Implement the hmeans algorithm
    % For each point, find the distance to all cluster centers
    for i = 1:n
        dist = sum((repmat(x(i,:),k,1)-nc).^2,2);
        [m,ind] = min(dist); % assign it to this cluster center
        cid(i) = ind;
    end
    % Find the new cluster centers
    for i = 1:k
        % find all points in this cluster
        ind = find(cid==i);
        % Find the number in each cluster;
        nr(i) = length(ind);
        % find the centroid; average over rows explicitly so that a
        % single-member cluster is not averaged across its columns, and
        % leave the center untouched when the cluster is empty.
        if nr(i) > 0
            nc(i,:) = mean(x(ind,:),1);
        end
    end
    iter = iter + 1;
end

% Now check each observation to see if the error can be minimized some more.
% maxiter = 2 with iter starting at 1 means exactly one pass over the data.
maxiter = 2;
iter = 1;
move = 1;
while iter < maxiter && move ~= 0
    move = 0;
    % Loop through all points.
    for i = 1:n
        % find the distance to all cluster centers
        dist = sum((repmat(x(i,:),k,1)-nc).^2,2);
        r = cid(i); % This is the cluster id for x
        % NOTE(review): the same factor nr/(nr+1) is applied to the
        % point's own cluster; the usual exchange test uses nr/(nr-1)
        % there. Left as in the original - confirm intent.
        dadj = nr./(nr+1).*dist'; % All adjusted distances
        [m,ind] = min(dadj); % minimum should be the cluster it belongs to
        % Move x only when another cluster is strictly better; a tie
        % brings no improvement and would just shuffle points around.
        if ind ~= r && m < dadj(r)
            cid(i) = ind;
            % Update the receiving cluster.
            nr(ind) = nr(ind) + 1;
            nc(ind,:) = mean(x(cid == ind,:),1);
            % Update the cluster the point left; keep its old center
            % if it has just become empty.
            nr(r) = nr(r) - 1;
            if nr(r) > 0
                nc(r,:) = mean(x(cid == r,:),1);
            end
            move = 1;
        end
    end
    iter = iter+1;
end

centers = nc;
if move == 0
    disp('No points were moved after the initial clustering procedure.')
else
    disp('Some points were moved after the initial clustering procedure.')
end

% Put the caller's warning configuration back.
warning(ws);
----------------------------------------------------------------------------
matlab工具箱使用
首先我们装入数据集:kmeansdata
>> load kmeansdata;
>> size(X)
ans =
560 4
数据形式为:
然后调用 kmeans(),选择 k=4,距离采用 city block(街区距离,即曼哈顿距离:只能沿上下、左右方向走,不能走对角线);默认情况下使用的是平方欧氏距离('sqeuclidean')。初始聚类中心随机选择。
>> idx4=kmeans(X,4,'distance','city');
返回的是560*1的列向量,表明各行数据所属的类别;
可以通过调用 silhouette()这个函数来观察结果。
>> [silh4,h]= silhouette(X,idx4,'city');
>>xlabel('Silhouette Value')
>>ylabel('Cluster')
具体参见matlab说明
-----------------------------------------------------------------------------
C 语言实现
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/* Boolean constants used as return values by IsEqual(). */
#define TRUE 1
#define FALSE 0
/* Program-wide state shared by all functions below; set up in InitData(). */
int N;// number of data values
int K;// number of clusters
int * CenterIndex;// indices into AllData of the values picked as initial centroids
double * Center;// current centroid of each cluster (K entries)
double * CenterCopy;// copy of the centroids, compared against Center to detect convergence
double * AllData;// the input data values (N entries)
double ** Cluster;// Cluster[i] holds the values currently assigned to cluster i
int * Top;// number of values stored in each cluster; also acts as a stack top when appending
/*
 * Fills center[0..k-1] with k distinct random integers x, 0 <= x <= n-1,
 * to be used as the indices of the initial centroids.
 *
 * n      - size of the range to draw from (values are in [0, n-1])
 * k      - how many distinct values to produce
 * center - output array with room for at least k ints
 *
 * The generator is (re)seeded from the current time on every call.
 * If k > n only n distinct values exist, so k is clamped to n and the
 * remaining entries of center are left untouched (the unguarded loop
 * would never terminate in that case). Nothing is written when center
 * is NULL or when n or k is not positive.
 */
void CreateRandomArray(int n, int k,int * center)
{
    int filled = 0; /* number of distinct values stored so far */
    int j;

    if (center == NULL || n <= 0 || k <= 0)
    {
        return;
    }
    if (k > n)
    {
        k = n;
    }

    srand( (unsigned)time( NULL ) );
    while (filled < k)
    {
        int candidate = rand() % n;

        /* Look for the candidate among the values already accepted. */
        for (j = 0; j < filled; j++)
        {
            if (center[j] == candidate)
            {
                break;
            }
        }
        /* j == filled means the scan finished without a match: accept it.
           Otherwise simply draw again on the next pass. */
        if (j == filled)
        {
            center[filled++] = candidate;
        }
    }
}
//返回距离最小的 心的序心
号心 心 质
心
int GetIndex(double value,double * center)
{
int i=0;
int index=i;//最小的质心序号
double min=fabs(value-center[i]);//距质心最小距离
for(i=0;i<K;i++)
{
if(fabs(value-center[i])<min)//如果比当前距离还小,更新最小的质心序号和距离值
{
index=i;
min=fabs(value-center[i]);
}
}
return index;
}
/* Copies the K current centroids from Center into CenterCopy. */
void CopyCenter()
{
    const double *src = Center;
    double *dst = CenterCopy;
    int remaining = K;

    /* Walk both arrays in step until K entries have been copied. */
    while (remaining-- > 0)
    {
        *dst++ = *src++;
    }
}
/*
 * Chooses the initial centroids at random: draws K distinct indices below N
 * into CenterIndex, copies the corresponding data values into Center, and
 * then mirrors Center into CenterCopy.
 */
void InitCenter()
{
    int slot = 0;

    CreateRandomArray(N,K,CenterIndex);
    while (slot < K)
    {
        int picked = CenterIndex[slot];
        Center[slot] = AllData[picked];
        ++slot;
    }
    CopyCenter();
}
/*
 * Appends value to cluster number index.
 * Top[index] is the next free position in Cluster[index] and is advanced
 * by one, i.e. this is a stack push.
 */
void AddToCluster(int index,double value)
{
    int slot = Top[index];

    Cluster[index][slot] = value;
    Top[index] = slot + 1;
}
/*
 * Rebuilds every cluster from scratch: empties all K clusters, then assigns
 * each of the N data values to the cluster whose centroid is nearest.
 */
void UpdateCluster()
{
    int c;
    int p;

    /* Emptying a cluster only requires resetting its element count. */
    for (c = K - 1; c >= 0; --c)
    {
        Top[c] = 0;
    }

    for (p = 0; p < N; ++p)
    {
        double v = AllData[p];
        AddToCluster(GetIndex(v, Center), v);
    }
}
/*
 * Recomputes each centroid as the arithmetic mean of the values currently in
 * its cluster. A cluster with no members keeps its previous centroid.
 */
void UpdateCenter()
{
    int c;

    for (c = 0; c < K; ++c)
    {
        const int count = Top[c];
        const double *members = Cluster[c];
        double total = 0;
        int m;

        if (count <= 0)
        {
            continue; /* empty cluster: leave Center[c] unchanged */
        }
        for (m = 0; m < count; ++m)
        {
            total += members[m];
        }
        Center[c] = total / count;
    }
}
/*
 * Reports whether two centroid arrays of length K hold the same values.
 *
 * center1, center2 - arrays with K entries each
 *
 * Returns TRUE when every pair of entries agrees to within a small relative
 * tolerance, FALSE as soon as one pair differs. The comparison is done on
 * the difference of the two values (the previous code took fabs() of the
 * result of "!=", i.e. of a boolean, which was a misplaced parenthesis).
 */
int IsEqual(double * center1 ,double * center2)
{
    int i;
    for(i=0;i<K;i++)
    {
        double a=center1[i];
        double b=center2[i];
        /* Scale the tolerance by the larger magnitude, but never below 1,
           so values near zero are compared with an absolute tolerance. */
        double scale=fabs(a)>fabs(b)?fabs(a):fabs(b);
        if(scale<1.0)
        {
            scale=1.0;
        }
        if(fabs(a-b)>1e-12*scale)
        {
            return FALSE;
        }
    }
    return TRUE;
}
/*
 * Prints the clustering result: a separator line, then for each of the K
 * clusters one header line with its index and centroid followed by one line
 * listing the values assigned to it.
 *
 * Line breaks are emitted after the separator, after each header and after
 * each member list; without them the whole report ran together on a single
 * unterminated line.
 */
void Print()
{
    int i,j;
    printf("--------------------------------------\n");
    for(i=0;i<K;i++)
    {
        printf("第%d 组: 质心(%f)\n",i,Center[i]);
        for(j=0;j<Top[i];j++)
        {
            printf("%f ",Cluster[i][j]);
        }
        printf("\n");
    }
}
/*
 * Reads the problem from standard input and prepares all global state:
 *   1. reads N (number of data values) and K (number of clusters),
 *   2. allocates Center, CenterIndex, CenterCopy, Top, AllData and the K
 *      per-cluster buffers (each large enough for all N values),
 *   3. reads the N data values,
 *   4. picks the initial centroids and builds the initial clusters.
 *
 * Data values are read as floating point ("%lf"), so both integer and
 * fractional input is accepted.
 *
 * On invalid input (unparsable number, N <= 0, K <= 0, K > N) or when an
 * allocation fails, a message is written to stderr and the program exits
 * with EXIT_FAILURE.
 */
void InitData()
{
    int i=0;

    printf("输入数据个数: ");
    if(scanf("%d",&N)!=1 || N<=0)
    {
        fprintf(stderr,"数据个数无效\n");
        exit(EXIT_FAILURE);
    }
    printf("输入簇个数: ");
    if(scanf("%d",&K)!=1 || K<=0 || K>N)
    {
        fprintf(stderr,"簇个数无效(应满足 1<=K<=N)\n");
        exit(EXIT_FAILURE);
    }

    Center=(double *)malloc(sizeof(double)*K);      /* centroids */
    CenterIndex=(int *)malloc(sizeof(int)*K);       /* indices of the initial centroids */
    CenterCopy=(double *)malloc(sizeof(double)*K);  /* centroid snapshot */
    Top=(int *)malloc(sizeof(int)*K);               /* per-cluster element counts */
    AllData=(double *)malloc(sizeof(double)*N);     /* input data */
    Cluster=(double **)malloc(sizeof(double *)*K);  /* per-cluster buffers */
    if(Center==NULL || CenterIndex==NULL || CenterCopy==NULL
       || Top==NULL || AllData==NULL || Cluster==NULL)
    {
        fprintf(stderr,"内存分配失败\n");
        exit(EXIT_FAILURE);
    }

    /* Every cluster may, in the worst case, receive all N values. */
    for(i=0;i<K;i++)
    {
        Cluster[i]=(double *)malloc(sizeof(double)*N);
        if(Cluster[i]==NULL)
        {
            fprintf(stderr,"内存分配失败\n");
            exit(EXIT_FAILURE);
        }
        Top[i]=0;
    }

    printf("输入%d 数据: ",N);
    for(i=0;i<N;i++)
    {
        if(scanf("%lf",&AllData[i])!=1)
        {
            fprintf(stderr,"第 %d 个数据无效\n",i+1);
            exit(EXIT_FAILURE);
        }
    }

    InitCenter();    /* choose the initial centroids */
    UpdateCluster(); /* build the initial clusters */
}
/*
算法描述:
K 均值算法:
给定类的个数 K,将 N 个对象分到 K 个类中去,
使得类内对象之间的相似性最大,而类之间的相似性最小。
*/
main()
{
int Flag=1;//迭代标志,若为 false,则迭代结束
int i=0;
InitData();//初始化数据
while(Flag)//开始迭代
{
UpdateCluster();//更新各个聚类
UpdateCenter();//更新质心数组
if(IsEqual(Center,CenterCopy))//如果本次迭代与前次的质心聚合相等,即已收敛,结束退出
{
Flag=0;
}
else//否则将质心副本置为本次迭代得到的的质心集合
{
CopyCenter();//将质心副本置为本次迭代得到的的质心集合
}
}
Print();//输出结果
getchar();
getchar();
}
本文引用地址:http://blog.sciencenet.cn/home.php?mod=space&uid=522684&do=blog&id=398625