层次聚类算法之single-linkage和complete-linkage(C语言实现)

层次聚类试图在不同层次上对数据集合进行划分, 从而形成树形的聚类结构。数据集的划分可采用“自底向上”的聚合策略,也可以采用“自顶向下”的分拆策略。

AGNES是一种采用自底向上的聚合策略的层次聚合算法,它先将数据集中的每个样本看作是一个初始的聚类簇,然后在算法进行的每一步中找出距离最近的两个聚类来进行合并,该过程不断的重复,直到到达预设的聚类簇的个数。

改算法的关键是如何计算聚类之间的距离, 实际上,每一个聚类是一个样本的集合,因此,只需要采用关于集合的某种距离即可。

最近距离由两个簇的最近的样本来决定, 最大距离由两个簇的最远的样本来决定,由此分别产生的AGNES算法又分别称为single-linkage和complete-linkage算法。


single-linkage算法

#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include<string.h>
#include<io.h>

int sample_size;
int sample_dimension;
char filename[200];

double** data;
double** distance;
int cluster_count;

void initialization();
void readDataFromFile();
double calculateDistance_BetweenTwoObject(int, int);
void initializationDistanceMatrix();
void AGNES();
void findMinDistance_BetweenClsuter_MIN(int*, int*);
void combineCluster(int, int);
void writeToFile(int);


int main(int argc, char* argv[])
{
	if( argc != 4 )
	{
		printf("This algorithm requires 4 user-specified parameter"
				"\n\t\tthe number of sample"
				"\n\t\tthe dimension of sample"
				"\n\t\tthe filename contain the sample"
				"\n\n");
		exit(0);
	}
	sample_size = atoi(argv[1]);
	sample_dimension = atoi(argv[2]);
	strcat(filename, argv[3]);

	initialization();
	readDataFromFile();
	initializationDistanceMatrix();

	AGNES();

	return 0;
}

/*
 * initialize the dynamic array for storing sample
 * */
void initialization()
{
	//initializaion the sample data
	int i, j;
	data = (double**)malloc(sizeof(double*) * (sample_size + 1));
	if( !data )
	{
		printf("data malloc error: 0");
		exit(0);
	}
	for( i = 1; i <= sample_size; i++ )
	{
		data[i] = (double*)malloc(sizeof(double) * (sample_dimension + 1));
		if( !data[i] )
		{
			printf("data malloc error: %d", i);
			exit(0);
		}
	}

	//initialiation the distance data
	distance = (double**)malloc(sizeof(double*) * (sample_size + 1));
	if( !distance )
	{
		printf("distance malloc error: 0");
		exit(0);
	}
	for( i = 1; i <= sample_size; i++ )
	{
		distance[i] = (double*)malloc(sizeof(double) * (sample_size + 1));
		if( !distance[i] )
		{
			printf("distance malloc error: %d", i);
			exit(0);
		}
	}

	//the 0th element of all row indicate the clauterID of the object
	for( i = 1; i <= sample_size; i++ )
	{
		distance[i][0] = i;
	}
}

/*
 * read the sample data from file
 * */
void readDataFromFile()
{
	FILE* fread;
	int i;
	int j;
	if( NULL == (fread = fopen(filename, "r")))
	{
		printf("open file(%s) error: ", filename);
		exit(0);
	}
	for( i = 1; i <= sample_size; i++ )
	{
		for( j = 1; j <= sample_dimension; j++ )
		{
			if( 1 != fscanf(fread, "%lf ", &data[i][j]))
			{
				printf("fscanf error: (%d, %d)", i, j);
				exit(0);
			}
		}
	}

	//test
	printf("print the origin data:\n");
	for( i = 1; i <= sample_size; i++ )
	{
		for( j = 1; j <= sample_dimension; j++ )
		{
			printf("%f\t", data[i][j]);
		}
		printf("\n");
	}
	//test END
}

/*
 * calculate distance between two objects
 * */
double calculateDistance_BetweenTwoObject(int firstID, int secondID)
{
	double distance = 0.0;
	int i;
	for( i = 1; i <= sample_dimension; i++ )
	{
		distance = distance + pow(data[firstID][i] - data[secondID][i], 2);
	}
	return sqrt(distance);
}

/*
 * calculate initialization distance matrix
 * */
void initializationDistanceMatrix()
{
	int i, j;
	for( i = 1; i <= sample_size; i++ )
	{
		for( j = i; j <= sample_size; j++ )
		{
			distance[i][j] = calculateDistance_BetweenTwoObject(i, j);
			distance[j][i] = distance[i][j];
		}
	}
	
	//test
	printf("print the origin distance matrix\n");
	for( i = 1; i <= sample_size; i++ )
	{
		for( j = 0; j <= sample_size; j++ )
		{
			printf("%f ", distance[i][j]);
		}
		printf("\n");
	}
	//test END
}

/***************************************************************************************************************
 *						AGNES
 ***************************************************************************************************************/
void AGNES()
{
	cluster_count = sample_size;
	int cluster_1, cluster_2;
	int count = 1;
	while( cluster_count > 3 )
	{
		printf("-------------------%d--------------------\n", count);
		findMinDistance_BetweenClsuter_MIN(&cluster_1, &cluster_2);
		combineCluster(cluster_1, cluster_2);				
		cluster_count--;
		writeToFile(count++);
	}
}

/*
 * find two clusters the distance between them is minimun, and store the clusterID in @cluster_1 and @cluster_2, respectively.
 * */
void findMinDistance_BetweenClsuter_MIN(int* cluster_1, int* cluster_2)
{
	int i, j;
	double min_distance;
	int flag = 1;
	for( i = 1; i <= sample_size; i++ )
	{
		for( j = i + 1; j <= sample_size; j++ )
		{
			if( distance[i][0] == distance[j][0] )
			{
				printf("same cluster!!!! %d and %d\n\n", i , j);
				continue;
			}
			else if( flag == 1 )
			{
				min_distance = distance[i][j];
				*cluster_1 = i;
				*cluster_2 = j;
				flag = 0;
				printf("----get the initial minimum distance is %f(%d, %d)\n", min_distance, i, j);
				continue;
			}
			if( distance[i][j] < min_distance )
			{
				min_distance = distance[i][j];
				*cluster_1 = i;
				*cluster_2 = j;
			}
		}
	}
	printf("the minimum is %f :(%d, %d)\n", min_distance, *cluster_1, *cluster_2);
}

/*
 * combine the two clusters to one
 * */
void combineCluster(int cluster_1, int cluster_2)
{
	int i;
	int buffer;
	buffer = distance[cluster_2][0];
	for( i = 1; i <= sample_size; i++ )
	{
		if( distance[i][0] == buffer )
		{
			distance[i][0] = distance[cluster_1][0];
		}
	}

	//test
	printf(" the minimum distance is %f, and the object is %d and %d\n", distance[cluster_1][cluster_2], cluster_1, cluster_2);
	int j;
	for( i = 1; i <= sample_size; i++ )
	{
		for( j = 0; j <= sample_size; j++ )
		{
			printf("%f ", distance[i][j]);
		}
		printf("\n");
	}
}

/*
 * write the result of clustering information to file
 * */
void writeToFile(int round)
{
	int i;
	int j;
	int* auxiliary = (int*)malloc(sizeof(int) * (sample_size + 1));
	if( !auxiliary )
	{
		printf("auxiliary malloc error");
		exit(0);
	}
	for( i = 1; i <= sample_size; i++ )
		auxiliary[i] = 0;
	for( i = 1; i <= sample_size; i++ )
	{
		auxiliary[(int)distance[i][0]]++;
	}
	int *clusterID = (int*)malloc(sizeof(int) * (sample_size - round + 1));
	if( !clusterID )
	{
		printf("clusterID malloc error");
		exit(0);
	}
	int counter = 1;
	for( i = 1; i <= sample_size; i++ )
	{
		if( auxiliary[i] != 0 )
			clusterID[counter++] = i;
	}
	//test
	printf("cluster ID:");
	for( i = 1; i <= sample_size - round; i++ )
	{
		printf("%d ", clusterID[i]);
	}
	printf("\n");
	//test END

	printf("writeToFile -----------round: %d\n", round);
	FILE** fwrite;
	fwrite = (FILE**)malloc(sizeof(FILE*) * (sample_size + 1));
	if( !fwrite )
	{
		printf("fwrite malloc error\n");
		exit(0);
	}

	char filename[200] = "";
	char instruction[200] = "";
	sprintf(instruction, "md  Round_%d", round);
	system(instruction);
	
	for( i = 1; i <= sample_size - round; i++ )
	{
		sprintf(filename, ".//Round_%d//cluster_%d.data", round, clusterID[i]);
		if( NULL == (fwrite[clusterID[i]] = fopen(filename, "w")))
		{
			printf("open file(%s) error\n", filename);
			exit(0);
		}
	}
	for( i = 1; i <= sample_size; i++ )
	{
		printf("%d ", (int)distance[i][0]);
		for( j = 1; j <= sample_dimension; j++ )
		{
			fprintf(fwrite[(int)distance[i][0]], "%f\t", data[i][j]);
			printf("%f ", data[i][j]);
		}
		fprintf(fwrite[(int)distance[i][0]], "\n");
		printf("\n");
	}
	
	for( i = 1; i <= sample_size - round; i++ )
	{
		fclose(fwrite[clusterID[i]]);
	}
}

complete-linkage算法

#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include<string.h>
#include<io.h>

int sample_size;
int sample_dimension;
char filename[200];

double** data;
double** distance;
int cluster_count;

void initialization();
void readDataFromFile();
double calculateDistance_BetweenTwoObject(int, int);
void initializationDistanceMatrix();
void findMinDistance_BetweenClsuter_MAX(int*, int*);
void AGNES();
void combineCluster(int, int);
void updateDistance(int);
void writeToFile(int);


int main(int argc, char* argv[])
{
	if( argc != 4 )
	{
		printf("This algorithm requires 4 user-specified parameter"
				"\n\t\tthe number of sample"
				"\n\t\tthe dimension of sample"
				"\n\t\tthe filename contain the sample"
				"\n\n");
		exit(0);
	}
	sample_size = atoi(argv[1]);
	sample_dimension = atoi(argv[2]);
	strcat(filename, argv[3]);

	initialization();
	readDataFromFile();
	initializationDistanceMatrix();
	AGNES();
	return 0;
}

/*
 * initialize the dynamic array for storing sample
 * */
void initialization()
{
	//initializaion the sample data
	int i, j;
	data = (double**)malloc(sizeof(double*) * (sample_size + 1));
	if( !data )
	{
		printf("data malloc error: 0");
		exit(0);
	}
	for( i = 1; i <= sample_size; i++ )
	{
		data[i] = (double*)malloc(sizeof(double) * (sample_dimension + 1));
		if( !data[i] )
		{
			printf("data malloc error: %d", i);
			exit(0);
		}
	}

	//initialiation the distance data
	distance = (double**)malloc(sizeof(double*) * (sample_size + 1));
	if( !distance )
	{
		printf("distance malloc error: 0");
		exit(0);
	}
	for( i = 0; i <= sample_size; i++ )
	{
		distance[i] = (double*)malloc(sizeof(double) * (sample_size + 1));
		if( !distance[i] )
		{
			printf("distance malloc error: %d", i);
			exit(0);
		}
	}

	//the 0th element of all row indicate the clauterID of the object
	for( i = 1; i <= sample_size; i++ )
	{
		distance[i][0] = i;
	}
	//where reserved if distance[0][i] = 1, 
	for( i = 1; i <= sample_size; i++ )
	{
		distance[0][i] = 1;
	}
}

/*
 * read the sample data from file
 * */
void readDataFromFile()
{
	FILE* fread;
	int i;
	int j;
	if( NULL == (fread = fopen(filename, "r")))
	{
		printf("open file(%s) error: ", filename);
		exit(0);
	}
	for( i = 1; i <= sample_size; i++ )
	{
		for( j = 1; j <= sample_dimension; j++ )
		{
			if( 1 != fscanf(fread, "%lf ", &data[i][j]))
			{
				printf("fscanf error: (%d, %d)", i, j);
				exit(0);
			}
		}
	}
}

/*
 * calculate distance between two objects
 * */
double calculateDistance_BetweenTwoObject(int firstID, int secondID)
{
	double distance = 0.0;
	int i;
	for( i = 1; i <= sample_dimension; i++ )
	{
		distance = distance + pow(data[firstID][i] - data[secondID][i], 2);
	}
	return sqrt(distance);
}

/*
 * calculate initialization distance matrix
 * */
void initializationDistanceMatrix()
{
	int i, j;
	for( i = 1; i <= sample_size; i++ )
	{
		for( j = i; j <= sample_size; j++ )
		{
			distance[i][j] = calculateDistance_BetweenTwoObject(i, j);
			distance[j][i] = distance[i][j];
		}
	}
}

/***************************************************************************************************************
 *						AGNES
 ***************************************************************************************************************/
void AGNES()
{
	cluster_count = sample_size;
	int cluster_1, cluster_2;
	int count = 1;
	while( cluster_count > 3 )
	{
		printf("-------------------%d--------------------\n", count);
		findMinDistance_BetweenClsuter_MAX(&cluster_1, &cluster_2);
		combineCluster(cluster_1, cluster_2);
		updateDistance(cluster_1);		
		cluster_count--;
		writeToFile(count++);
	}
}

/*
 * find two clusters the distance between them is minimun, and store the clusterID in @cluster_1 and @cluster_2, respectively.
 * */
void findMinDistance_BetweenClsuter_MAX(int* cluster_1, int* cluster_2)
{
	int i, j;
	double min_distance;
	int flag = 1;
	int object_1;
	int object_2;
	for( i = 1; i <= sample_size; i++ )
	{
		for( j = i + 1; j <= sample_size; j++ )
		{
			if( distance[i][0] == distance[j][0] || ( distance[0][i] == 0 || distance[0][j] == 0 ) )
			{
				continue;
			}
			else if( flag == 1 )
			{
				min_distance = distance[i][j];
				object_1 = i;
				object_2 = j;
				flag = 0;
				continue;
			}
			if( distance[i][j] < min_distance )
			{
				min_distance = distance[i][j];
				object_1 = i;
				object_2 = j;
			}
		}
	}
	*cluster_1 = distance[object_1][0];
	*cluster_2 = distance[object_2][0];
}

/*
 * combine the two clusters to one
 * */
void combineCluster(int cluster_1, int cluster_2)
{
	int i;
	for( i = 1; i <= sample_size; i++ )
	{
		if( distance[i][0] == cluster_2 )
		{
			distance[i][0] = cluster_1;
		}
	}
}

/*
 * update distance data
 * */
void updateDistance(int clusterID)
{
	int i, j;
	int flag = 1;
	int save_i;
	for( i = 1; i <= sample_size; i++ )
	{
		if( distance[i][0] == clusterID && flag == 1 )
		{
			flag = 0;
			save_i = i;
			continue;
		}
		else if( distance[i][0] == clusterID && flag == 0 )
		{
			distance[0][i] = 0;
			for( j = 1; j <= sample_size; j++ )
			{
				if( distance[i][j] > distance[save_i][j] )
				{
					distance[save_i][j] = distance[i][j];
				}
			}
		}
	}
	for( i = 1; i <= sample_size; i++ )
	{
		distance[i][save_i] = distance[save_i][i];
	}
}

/*
 * write the result of clustering information to file
 * */
void writeToFile(int round)
{
	int i;
	int j;
	int* auxiliary = (int*)malloc(sizeof(int) * (sample_size + 1));
	if( !auxiliary )
	{
		printf("auxiliary malloc error");
		exit(0);
	}
	for( i = 1; i <= sample_size; i++ )
		auxiliary[i] = 0;
	for( i = 1; i <= sample_size; i++ )
	{
		auxiliary[(int)distance[i][0]]++;
	}
	int *clusterID = (int*)malloc(sizeof(int) * (sample_size - round + 1));
	if( !clusterID )
	{
		printf("clusterID malloc error");
		exit(0);
	}
	int counter = 1;
	for( i = 1; i <= sample_size; i++ )
	{
		if( auxiliary[i] != 0 )
			clusterID[counter++] = i;
	}

	printf("writeToFile -----------round: %d\n", round);
	FILE** fwrite;
	fwrite = (FILE**)malloc(sizeof(FILE*) * (sample_size + 1));
	if( !fwrite )
	{
		printf("fwrite malloc error\n");
		exit(0);
	}

	char filename[200] = "";
	char instruction[200] = "";
	sprintf(instruction, "md  Round_%d", round);
	system(instruction);
	
	for( i = 1; i <= sample_size - round; i++ )
	{
		sprintf(filename, ".//Round_%d//cluster_%d.data", round, clusterID[i]);
		if( NULL == (fwrite[clusterID[i]] = fopen(filename, "w")))
		{
			printf("open file(%s) error\n", filename);
			exit(0);
		}
	}
	for( i = 1; i <= sample_size; i++ )
	{
		printf("%d ", (int)distance[i][0]);
		for( j = 1; j <= sample_dimension; j++ )
		{
			fprintf(fwrite[(int)distance[i][0]], "%f\t", data[i][j]);
		}
		fprintf(fwrite[(int)distance[i][0]], "\n");
	}
	
	for( i = 1; i <= sample_size - round; i++ )
	{
		fclose(fwrite[clusterID[i]]);
	}
}



  • 6
    点赞
  • 19
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值