K-Means源代码
作者:liangdas
出处:简单点儿,通俗点儿,机器学习 http://write.blog.csdn.net/postedit/25512617
下面是我写的K-Means的源代码,一共有三个文件:一个是K-Means.h头文件,一个是K-Means.c文件,另外一个是Main.cpp文件。K-Means.c文件中引用了系统的stdio.h、stdlib.h文件,是因为里面用到了printf、exit以及读写文件用的fopen、fscanf、fprintf等函数;如果去掉这些函数,也就可以去掉stdio.h、stdlib.h的引用,那么K-Means.h和K-Means.c文件就都是用纯C自己写的啦,可移植性就更强啦!
后面的main.cpp是介绍怎么使用的,输入是按txt格式存贮的,存贮格式是:
sample number(样本总数)
feature number(特征维数)
intended class number(待分类的类别数目)
feature list as(特征列表):
feature1 feature2 ...
feature1 feature2 ...
......
当然也可以自己定义数据的格式,并重新编写LoadPatterns()函数。
K-Means.h(注意:源文件中写的是 #include "K_Means.h",保存头文件时文件名需与之保持一致)
/***********************************
* Author: liangdas
* Time: 20140504
* Version: 0_20140504
* Contaction:
* QQ: 358536026 Email: liangdas1986@163.com
* Working place: Beijing Samsuang Telecom&Technology Institute
************************************/
#ifndef __K_MEAN_H__
#define __K_MEAN_H__
#ifdef __cplusplus
extern "C"{
#endif
#define SUCCESS 1
#define FAILURE 0
#define TRUE 1
#define FALSE 0
#define MAX_DIM 20 //特征维数
#define MAX_SAMPLES 2000 //一个类别包含的样本个数
#define MAX_CLUSTER 10 //最大类别数目
/* One cluster: its current centre plus the list of samples assigned to it. */
typedef struct stCluster
{
double Center[MAX_DIM]; /* centre coordinates; the K-Means routines read and write the first NumDim entries */
int Member[MAX_SAMPLES]; /* sample indices (row numbers of the pattern table) assigned to this cluster */
int NumMembers; /* number of valid entries at the front of Member[] */
}CLASSCLUSTER, *PCLASSCLUSTER;
/***************************************************************
* Function: ReClassify()
* Description: 重新计算样本所属的类别
* Input&Output:
* Returns:
****************************************************************/
void ReClassify(double** Pattern, int NumSamples, PCLASSCLUSTER Cluster,
int ClusterNum, int NumDim);
/***************************************************************
* Function: CalcNewClustCenters()
* Description: 重新计算类别中心
* Input&Output:
* Returns:
****************************************************************/
int CalcNewClustCenters(double** Pattern, int NumSamples, PCLASSCLUSTER Cluster,
int ClusterNum, int NumDim);
/***************************************************************
* Function: CalcuDistance()
* Description: 计算序号为sampleID的样本到第clusterID个类别中心的距离(返回的是欧氏距离的平方,没有开方)
* Input&Output:
* Returns:
****************************************************************/
double CalcuDistance(int sampleID, int clusterID, double** Pattern, int NumSamples,
PCLASSCLUSTER Cluster, int ClusterNum, int NumDim);
/***************************************************************
* Function: FindClosestCluster()
* Description: 找到样本序号为sampleID的样本,所属的类别
* Input&Output:
* Returns:
****************************************************************/
int FindClosestCluster(int sampleID, double** Pattern, int NumSamples,
PCLASSCLUSTER Cluster, int ClusterNum, int NumDim);
/***************************************************************
* Function: LoadPatterns()
* Description: 通过文件名字,加载样本列表
* Input&OutPut:
* Return:
* File format: 样本数 特征维数 待分类类别数目 样本。。。
****************************************************************/
int LoadPatterns(char *fname, double** Pattern, int* pNumPatterns,
PCLASSCLUSTER Cluster, int* pClusterNum, int* pNumDim);
/***************************************************************
* Function: InitClusters()
* Description: 指定初始类别中心,这个函数取的是样本序列的前ClusterNum
* 个样本作为聚类的初始类别中心
* Input&Output:
* Returns:
****************************************************************/
void InitClusters(double** Pattern, int NumSamples, PCLASSCLUSTER Cluster,
int ClusterNum, int NumDim);
/***************************************************************
* Function: RunKMeans()
* Description: 执行K-Means分类
* Input&Output:
* Returns:
****************************************************************/
void RunKMeans(double** Pattern, int NumSamples, PCLASSCLUSTER Cluster,
int ClusterNum, int NumDim, int nIterTimes);
/***************************************************************
* Function: SaveCenters()
* Description: 保存聚类中心的结果
* Input&Output:
* Returns:
****************************************************************/
void SaveCenters(char* pFilePath, PCLASSCLUSTER Cluster, int ClusterNum, int NumDim);
/***************************************************************
* Function: SaveClusters()
* Description: 保存聚类的样本的结果
* Input&Output:
* Returns:
****************************************************************/
void SaveClusters(char* pFilePath, double** Pattern, int NumSamples, PCLASSCLUSTER Cluster,
int ClusterNum, int NumDim);
#ifdef __cplusplus
}
#endif
#endif
K-Means.c
/***********************************
* Author: liangdas
* Time: 20140504
* Version: 0_20140504
* Contaction:
* QQ: 358536026 Email: liangdas1986@163.com
* Working place: Beijing Samsuang Telecom&Technology Institute
************************************/
#include <stdlib.h>
#include <io.h>
#include <stdio.h>
#include "K_Means.h"
#ifdef __cplusplus
extern "C"{
#endif
/********************************************************
* Function: LoadPatterns()
* Description: Load the sample table from a text file.
* File format: <sample count> <feature dimension> <cluster count>
*              followed by <sample count> rows of <feature dimension> numbers.
* Input&Output:
*   fname        - path of the text file to read
*   Pattern      - caller-allocated table; Pattern[i][j] receives feature j of
*                  sample i. It must offer at least as many rows/columns as the
*                  Member[]/Center[] arrays of a cluster have entries.
*   pNumPatterns - receives the number of samples
*   Cluster      - not modified; only the sizes of its arrays are consulted
*   pClusterNum  - receives the requested number of clusters; the caller must
*                  still check it against the size of its own cluster array
*   pNumDim      - receives the feature dimension
* Return: SUCCESS when the complete file was read. FAILURE when the file cannot
*         be opened, is malformed or truncated, or declares sizes that do not
*         fit; the output values are then unspecified.
*********************************************************/
int LoadPatterns(char *fname, double** Pattern, int* pNumPatterns,
PCLASSCLUSTER Cluster, int* pClusterNum, int* pNumDim)
{
    /* Capacities of the fixed arrays inside a cluster (MAX_SAMPLES / MAX_DIM).
       sizeof does not evaluate its operand, so Cluster is never dereferenced. */
    const int nMaxSamples = (int)(sizeof(Cluster->Member) / sizeof(Cluster->Member[0]));
    const int nMaxDim = (int)(sizeof(Cluster->Center) / sizeof(Cluster->Center[0]));
    FILE* InFilePtr;
    int i, j;
    double x;
    int nResult = FAILURE;

    if ((InFilePtr = fopen(fname, "rt")) == NULL)
    {
        return FAILURE;
    }

    /* Header: number of patterns, vector dimension, number of clusters. */
    if (fscanf(InFilePtr, "%d", pNumPatterns) != 1 ||
        fscanf(InFilePtr, "%d", pNumDim) != 1 ||
        fscanf(InFilePtr, "%d", pClusterNum) != 1)
    {
        goto done;
    }

    /* Reject sizes that would overflow the fixed-size tables. */
    if (*pNumPatterns <= 0 || *pNumPatterns > nMaxSamples ||
        *pNumDim <= 0 || *pNumDim > nMaxDim ||
        *pClusterNum <= 0)
    {
        goto done;
    }

    for (i = 0; i < *pNumPatterns; i++)
    {
        for (j = 0; j < *pNumDim; j++)
        {
            if (fscanf(InFilePtr, "%lg", &x) != 1)
            {
                goto done; /* truncated or non-numeric data */
            }
            Pattern[i][j] = x;
        }
    }
    nResult = SUCCESS;

done:
    /* Single exit point so the stream is closed on every path. */
    fclose(InFilePtr);
    return nResult;
}
/***************************************************************
* Function: InitClusters()
* Description: Seed the clustering. Cluster i takes sample i as its initial
*              centre and as its only member; the centres are then printed.
*              When more clusters are requested than samples exist, a warning
*              is printed and the surplus clusters get an all-zero centre and
*              no members, instead of being seeded from rows that were never
*              loaded.
* Input&Output:
*   Pattern    - sample table, Pattern[sample][dimension]
*   NumSamples - number of valid rows in Pattern
*   Cluster    - array of ClusterNum clusters to initialise
*   ClusterNum - number of clusters
*   NumDim     - number of features per sample
* Returns: nothing
****************************************************************/
void InitClusters(double** Pattern, int NumSamples, PCLASSCLUSTER Cluster, int ClusterNum, int NumDim)
{
    int i, j;

    printf("Initial cluster centers:\n");
    if (ClusterNum > NumSamples)
    {
        printf("class number exceed to sample number\n");
    }

    for (i = 0; i < ClusterNum; i++)
    {
        if (i < NumSamples)
        {
            Cluster[i].Member[0] = i;
            Cluster[i].NumMembers = 1; /* keep the count consistent with Member[0] */
            for (j = 0; j < NumDim; j++)
            {
                Cluster[i].Center[j] = Pattern[i][j];
            }
        }
        else
        {
            /* No sample left to seed from. */
            Cluster[i].NumMembers = 0;
            for (j = 0; j < NumDim; j++)
            {
                Cluster[i].Center[j] = 0.0;
            }
        }
    }

    /* Print every coordinate; for 2-D data this is the familiar "(x,y)" form. */
    for (i = 0; i < ClusterNum; i++)
    {
        printf("ClusterCenter[%d]=(", i);
        for (j = 0; j < NumDim; j++)
        {
            printf("%s%f", (j > 0) ? "," : "", Cluster[i].Center[j]);
        }
        printf(")\n");
    }
    printf("\n");
}
/***************************************************************
* Function: RunKMeans()
* Description: Run the K-Means iteration: reassign every sample to a cluster,
*              then recompute the centres, until the centres stop changing or
*              nIterTimes passes have been made. At least one pass is always
*              executed.
* Input&Output:
*   Pattern    - sample table, Pattern[sample][dimension]
*   NumSamples - number of samples
*   Cluster    - clusters with initial centres on entry; final centres and
*                membership lists on return
*   ClusterNum - number of clusters
*   NumDim     - number of features per sample
*   nIterTimes - upper bound on the number of passes
* Returns: nothing
****************************************************************/
void RunKMeans(double** Pattern, int NumSamples, PCLASSCLUSTER Cluster, int ClusterNum, int NumDim, int nIterTimes)
{
    int nPass = 0;
    int nConverged;

    do
    {
        printf("iteration time = %d\n", nPass);
        ReClassify(Pattern, NumSamples, Cluster, ClusterNum, NumDim);
        nConverged = CalcNewClustCenters(Pattern, NumSamples, Cluster, ClusterNum, NumDim);
        nPass++;
    } while (nConverged == FALSE && nPass < nIterTimes);
}
/***************************************************************
* Function: CalcuDistance()
* Description: Distance between sample sampleID and the centre of cluster
*              clusterID. The value is the SQUARED Euclidean distance (no
*              square root is taken), which orders candidates the same way.
* Input&Output:
*   sampleID   - row index into Pattern
*   clusterID  - index into Cluster
*   Pattern    - sample table, Pattern[sample][dimension]
*   NumSamples - unused
*   Cluster    - cluster array
*   ClusterNum - unused
*   NumDim     - number of coordinates to compare
* Returns: sum over the first NumDim coordinates of (centre - sample)^2
****************************************************************/
double CalcuDistance(int sampleID, int clusterID, double** Pattern, int NumSamples, PCLASSCLUSTER Cluster, int ClusterNum, int NumDim)
{
    const double *pCenter = Cluster[clusterID].Center;
    const double *pSample = Pattern[sampleID];
    double dist = 0.0;
    double diff;
    int i;

    for (i = 0; i < NumDim; i++)
    {
        diff = pCenter[i] - pSample[i];
        dist += diff * diff;
    }
    return dist;
}
/***************************************************************
* Function: FindClosestCluster()
* Description: Find the cluster whose centre is nearest to sample sampleID,
*              using CalcuDistance(). On a tie the lowest cluster index wins.
* Input&Output:
*   sampleID   - row index into Pattern
*   Pattern    - sample table, Pattern[sample][dimension]
*   NumSamples - number of samples (passed through to CalcuDistance)
*   Cluster    - cluster array
*   ClusterNum - number of clusters to consider
*   NumDim     - number of features per sample
* Returns: index of the nearest cluster. When there is no cluster to compare
*          against (ClusterNum <= 0) an error is written to stderr and the
*          program exits with a failure status.
****************************************************************/
int FindClosestCluster(int sampleID, double** Pattern, int NumSamples, PCLASSCLUSTER Cluster, int ClusterNum, int NumDim)
{
    int i;
    int ClustID = -1;
    double MinDist = 0.0;
    double d;

    for (i = 0; i < ClusterNum; i++)
    {
        d = CalcuDistance(sampleID, i, Pattern, NumSamples, Cluster, ClusterNum, NumDim);
        /* The first cluster seeds the minimum, so no "huge" sentinel value is needed. */
        if (ClustID < 0 || d < MinDist)
        {
            MinDist = d;
            ClustID = i;
        }
    }

    if (ClustID < 0)
    {
        /* Callers index the cluster array with the result, so this cannot be
           reported through the return value; fail loudly instead. */
        fprintf(stderr, "FindClosestCluster: no cluster available for sample %d (ClusterNum=%d)\n",
                sampleID, ClusterNum);
        exit(EXIT_FAILURE);
    }
    return ClustID;
}
/***************************************************************
* Function: ReClassify()
* Description: Rebuild the membership lists: empty every cluster, then append
*              each sample, in index order, to the cluster returned by
*              FindClosestCluster().
* Input&Output:
*   Pattern    - sample table, Pattern[sample][dimension]
*   NumSamples - number of samples
*   Cluster    - cluster array; Member[] and NumMembers are rewritten,
*                the centres are only read
*   ClusterNum - number of clusters
*   NumDim     - number of features per sample
* Returns: nothing
****************************************************************/
void ReClassify(double** Pattern, int NumSamples, PCLASSCLUSTER Cluster, int ClusterNum, int NumDim)
{
    int nCluster;
    int nSample;

    /* Start from empty membership lists. */
    for (nCluster = 0; nCluster < ClusterNum; nCluster++)
    {
        Cluster[nCluster].NumMembers = 0;
    }

    /* Append each sample to its nearest cluster. */
    for (nSample = 0; nSample < NumSamples; nSample++)
    {
        PCLASSCLUSTER pOwner = &Cluster[FindClosestCluster(nSample, Pattern, NumSamples, Cluster, ClusterNum, NumDim)];
        pOwner->Member[pOwner->NumMembers++] = nSample;
    }
}
/***************************************************************
* Function: CalcNewClustCenters()
* Description: Move every cluster centre to the mean of its member samples.
*              A cluster without members keeps its current centre: it has no
*              mean, and dividing by zero would turn the centre into NaN and
*              make convergence impossible to detect.
* Input&Output:
*   Pattern    - sample table, Pattern[sample][dimension]
*   NumSamples - unused
*   Cluster    - cluster array; Member[]/NumMembers are read, Center[] is
*                updated
*   ClusterNum - number of clusters
*   NumDim     - number of features per sample (at most MAX_DIM)
* Returns: TRUE when no centre coordinate changed (converged), FALSE otherwise
****************************************************************/
int CalcNewClustCenters(double** Pattern, int NumSamples, PCLASSCLUSTER Cluster, int ClusterNum, int NumDim)
{
    int ConvFlag = TRUE;
    int VectID, i, j, k;
    double tmp[MAX_DIM]; /* per-dimension running sum, then the mean */

    for (i = 0; i < ClusterNum; i++)
    {
        if (Cluster[i].NumMembers <= 0)
        {
            continue; /* empty cluster: leave its centre untouched */
        }

        for (k = 0; k < NumDim; k++)
        {
            tmp[k] = 0.0;
        }

        /* Sum the member samples coordinate by coordinate. */
        for (j = 0; j < Cluster[i].NumMembers; j++)
        {
            VectID = Cluster[i].Member[j];
            for (k = 0; k < NumDim; k++)
            {
                tmp[k] += Pattern[VectID][k];
            }
        }

        for (k = 0; k < NumDim; k++)
        {
            tmp[k] = tmp[k] / Cluster[i].NumMembers;
            /* Exact comparison is intended: once the assignments stop
               changing, the same sums are recomputed bit for bit. */
            if (tmp[k] != Cluster[i].Center[k])
            {
                ConvFlag = FALSE;
            }
            Cluster[i].Center[k] = tmp[k];
        }
    }
    return ConvFlag;
}
/***************************************************************
* Function: SaveCenters()
* Description: Write the cluster centres to a tab-separated text file: a
*              header line, then one line per cluster holding its NumDim
*              centre coordinates followed by the cluster index. The clusters
*              are only read, never modified.
* Input&Output:
*   pFilePath  - path of the file to create or overwrite
*   Cluster    - cluster array
*   ClusterNum - number of clusters
*   NumDim     - number of coordinates per centre
* Returns: nothing; when the file cannot be opened a message is printed and
*          nothing is written
****************************************************************/
void SaveCenters(char* pFilePath, PCLASSCLUSTER Cluster, int ClusterNum, int NumDim)
{
    int i, j;
    FILE* fpResultFile;

    fpResultFile = fopen(pFilePath, "wt");
    if (fpResultFile == NULL)
    {
        printf("open file %s error\n", pFilePath);
        return;
    }

    fprintf(fpResultFile, "x\ty\tlabel\t\n");
    for (i = 0; i < ClusterNum; i++)
    {
        /* Saving must not touch Member[]: the membership list belongs to the
           clustering result and may still be written out by SaveClusters(). */
        for (j = 0; j < NumDim; j++)
        {
            fprintf(fpResultFile, "%f\t", Cluster[i].Center[j]);
        }
        fprintf(fpResultFile, "%d\n", i);
    }
    fclose(fpResultFile);
}
/***************************************************************
* Function: SaveClusters()
* Description: Write the clustered samples to a tab-separated text file: a
*              header line, then, cluster by cluster, one line per member
*              sample holding its NumDim features followed by the index of
*              the cluster it belongs to.
* Input&Output:
*   pFilePath  - path of the file to create or overwrite
*   Pattern    - sample table, Pattern[sample][dimension]
*   NumSamples - unused
*   Cluster    - cluster array with filled membership lists
*   ClusterNum - number of clusters
*   NumDim     - number of features per sample
* Returns: nothing; when the file cannot be opened a message is printed and
*          nothing is written
****************************************************************/
void SaveClusters(char* pFilePath, double** Pattern, int NumSamples, PCLASSCLUSTER Cluster,
int ClusterNum, int NumDim)
{
    FILE* fpResultFile;
    int i, j, k;
    int nSampleID;

    fpResultFile = fopen(pFilePath, "wt");
    if (fpResultFile == NULL)
    {
        printf("open file %s error\n", pFilePath);
        return;
    }

    fprintf(fpResultFile, "x\ty\tlabel\t\n");
    for (i = 0; i < ClusterNum; i++)
    {
        for (j = 0; j < Cluster[i].NumMembers; j++)
        {
            nSampleID = Cluster[i].Member[j];
            for (k = 0; k < NumDim; k++)
            {
                /* "%g" is the conversion for double; the former "%hg" paired
                   the h length modifier with g, which is undefined behaviour. */
                fprintf(fpResultFile, "%g\t", Pattern[nSampleID][k]);
            }
            fprintf(fpResultFile, "%d\n", i); /* cluster the sample belongs to */
        }
    }
    fclose(fpResultFile);
}
#ifdef __cplusplus
}
#endif
Main.cpp
#include <stdlib.h>
#include <stdio.h>
#include <io.h>
#include <string.h>
#include "K_Means.h"
#define MAX_ITER_TIMES 1000
int main(int argc, char *argv[])
{ //main procedure
//System kmeans;
//double Pattern[MAX_SAMPLES][MAX_DIM+1];
double** Pattern;
CLASSCLUSTER Cluster[MAX_CLUSTER];
int NumSamples; // Number of patterns
int NumDim; // Number of dimensions in vector
int ClusterNum; // Number of clusters
int i = 0;
Pattern = (double**)malloc(sizeof(double*)*MAX_SAMPLES);
for(i=0; i<MAX_SAMPLES; i++)
{
Pattern[i] = (double*)malloc(sizeof(double)*MAX_DIM);
}
char* pFilePath = argv[1];
if (argc<2)
{
printf("usage: intput k_means file\n");
exit(0);
}
if (LoadPatterns(pFilePath, (double**)Pattern, &NumSamples, Cluster, &ClusterNum, &NumDim) == FAILURE)
{
printf("read file %s error\n", pFilePath);
exit(0);
}
//
#if 0
InitClusters((double**)Pattern, NumSamples, Cluster, ClusterNum, NumDim);
#else
Cluster[0].Center[0] = 21;
Cluster[0].Center[1] = 377;
Cluster[1].Center[0] = 20;
Cluster[1].Center[1] = 377;
Cluster[2].Center[0] = 20;
Cluster[2].Center[1] = 376;
#endif
RunKMeans((double**)Pattern, NumSamples, Cluster, ClusterNum, NumDim, MAX_ITER_TIMES);
SaveClusters("cluster.txt", (double**)Pattern, NumSamples, Cluster, ClusterNum, NumDim);
SaveCenters("center.txt", Cluster, ClusterNum, NumDim);
//ShowClusters();
//delete memory
for(i=0; i<MAX_SAMPLES; i++)
{
free(Pattern[i]);
}
free(Pattern);
}
ps:使用或者转载请标明出处,禁止以商业为目的的使用。
如果有需要word版,或者是pdf版的,请与我联系,QQ:358536026