关闭

K-means算法(基于MovieLens数据分别对user和movie聚类)

标签: K-means算法对多维数据聚类MovieLens数据集C语言算法
1392人阅读 评论(3) 收藏 举报
分类:

本代码使用K-means算法对高维数据进行聚类,使用的数据集是MovieLens。MovieLens一共包含十万条记录,每一条记录由用户ID、电影ID、用户对电影的评分(1~5)以及日期组成。本代码以评分作为用户或电影的特征向量,分别对用户和电影进行聚类,聚类结果由代码写入文件。readData()函数中有两条fscanf语句(分别标注为For user和For movie,即纯代码文件中的第107、108行),启用其中一条并注释掉另一条,即可选择对用户还是对电影进行聚类,其他代码不变。

<span style="font-family:Courier New;">#include<stdio.h>
#include<errno.h>
#include<limits.h>
#include<math.h>
#include<stdlib.h>
#include<string.h>
#include<time.h>

/* Upper bound on the number of rounds executed by the loop in kmeans(). */
#define MAX_ROUND 100

/*
 * One record of the input file, as filled in by the fscanf call in readData().
 * initial_afterStatistic() stores attributeValue at
 * adjacentMatrix[itemID][attributeID].
 * */
typedef struct Item
{
	//ID of the entity being clustered (row index of adjacentMatrix)
	int itemID;
	//ID of the feature dimension (column index of adjacentMatrix)
	int attributeID;
	//value of that feature for that entity
	int attributeValue;
}Item;
//all input records; allocated in initial() with data_size + 1 slots, records live at index 1..data_size
Item* data;

//number of records to read, taken from argv[1]
int data_size;
//path of the input file, taken from argv[2]
char filename[200];
//number of clusters K, taken from argv[3]
int cluster_count;

//largest itemID found by statisticNumberOfItem_Attribute()
int item_size;
//largest attributeID found by statisticNumberOfItem_Attribute()
int attribute_size;
//error total of the current round: reset in partition_forAllItem_OneCluster(), accumulated in partition_forOneItem()
double squaredError;
//set by compareNewAndOldClusterCenter(): 1 when some center coordinate changed, 0 otherwise
int isContinue;

//distanceToCenter[c] is the distance from the item processed last to center c (c = 1..cluster_count)
double* distanceToCenter;
//(item_size + 1) x (attribute_size + 1) matrix; column 0 holds the cluster assigned to the row, columns 1.. hold attribute values
int** adjacentMatrix;
//cluster_center[c][1..attribute_size] is the center of cluster c; cluster_center[c][0] is set to c
double** cluster_center;

/* Forward declarations of the functions defined after main(). */

//set-up: allocation, input parsing, statistics, matrix construction, initial centers
void initial();
void readData();
void statisticNumberOfItem_Attribute();
void initial_afterStatistic();
void generateOriginalClusterCenter();
//assignment step: distances to the centers and choice of the nearest one
void calculateDistance_ToOneCenter(int, int);
void calculateDistance_ToAllCenter(int);
void partition_forOneItem(int, int);
void partition_forAllItem_OneCluster(int);
//output of one round's clusters
void writeToFile_oneCluster(int);
//update step and convergence check
void calculate_clusterCenter(int);
void compareNewAndOldClusterCenter(double**);
//driver loop
void kmeans();

/*
 * Parse @text as a decimal int that is at least 1 and strictly below INT_MAX
 * (the program sizes its arrays as value + 1, which must not overflow).
 * Returns 1 and stores the value in *out on success; returns 0 when the text
 * is empty, has trailing characters, or is out of range.
 * */
static int parsePositiveInt(const char* text, int* out)
{
	char* end;
	long value;

	errno = 0;
	value = strtol(text, &end, 10);
	if( end == text || *end != '\0' || errno == ERANGE )
		return 0;
	if( value < 1 || value >= INT_MAX )
		return 0;
	*out = (int)value;
	return 1;
}

/*
 * Entry point.
 *
 * Expects exactly three arguments:
 * 	argv[1]  number of records to read from the data file
 * 	argv[2]  path of the data file
 * 	argv[3]  number of clusters K
 * Validates them, stores them in the globals @data_size, @filename and
 * @cluster_count, runs the set-up steps and finally kmeans().
 * Returns 0 on success; exits with EXIT_FAILURE on invalid arguments.
 * */
int main(int argc, char* argv[])
{
	if( argc != 4 )
	{
		printf("This application need 3 paremeter to run"
				"\n\t\tthe first to indicate the size of data"
				"\n\t\tthe second to indicate the filename contain data"
				"\n\t\tthe third to indicate the size of cluster K\n");
		exit(EXIT_FAILURE);
	}
	if( !parsePositiveInt(argv[1], &data_size) )
	{
		fprintf(stderr, "invalid size of data: %s\n", argv[1]);
		exit(EXIT_FAILURE);
	}
	if( !parsePositiveInt(argv[3], &cluster_count) )
	{
		fprintf(stderr, "invalid size of cluster K: %s\n", argv[3]);
		exit(EXIT_FAILURE);
	}

	//bounded copy: an over-long path must be rejected, not written past the end of @filename
	int written = snprintf(filename, sizeof filename, "%s", argv[2]);
	if( written < 0 || (size_t)written >= sizeof filename )
	{
		fprintf(stderr, "filename is too long (at most %d characters): %s\n", (int)sizeof filename - 1, argv[2]);
		exit(EXIT_FAILURE);
	}
	printf("%d\t%s\t%d\n", data_size, filename, cluster_count);

	initial();
	readData();
	statisticNumberOfItem_Attribute();
	initial_afterStatistic();
	generateOriginalClusterCenter();

	kmeans();
	return 0;
}

/*
 * initialization:
 * 	allocate arrays dynamically use function of malloc
 * */
void initial()
{
	data = (Item*)malloc(sizeof(struct Item) * (data_size + 1));
	if( !data )
	{
		printf("data malloc error!\n");
		exit(0);
	}

	distanceToCenter = (double*)malloc(sizeof(double) * (cluster_count + 1));
	if( !distanceToCenter )
	{
		printf("distanceTocenter malloc error");
		exit(0);
	}
}

/*
 * Read @data_size rating records from @filename into data[1..data_size].
 *
 * Each record is expected on its own line and starts with three integers.
 * Only those three are stored; whatever follows them on the same line (for
 * example the date column the accompanying article describes) is skipped, so
 * it cannot be mistaken for the first field of the next record.
 *
 * Exactly one of the two fscanf statements below must be active: the
 * "For user" one clusters by the first column, the "For movie" one swaps the
 * first two columns so that the second column is clustered instead.
 *
 * Exits with EXIT_FAILURE when the file cannot be opened or a record cannot
 * be parsed.
 * */
void readData()
{
	FILE* input = fopen(filename, "r");
	if( NULL == input )
	{
		fprintf(stderr, "open file error: %s\n", filename);
		exit(EXIT_FAILURE);
	}

	int i;
	for( i = 1; i <= data_size; i++ )
	{
		int fields;
		//For user
		fields = fscanf(input, "%d\t%d\t%d", &data[i].itemID, &data[i].attributeID, &data[i].attributeValue);
		//For movie
		//fields = fscanf(input, "%d\t%d\t%d", &data[i].attributeID, &data[i].itemID, &data[i].attributeValue);
		if( 3 != fields )
		{
			fprintf(stderr, "fscanf error: %d\n", i);
			fclose(input);
			exit(EXIT_FAILURE);
		}
		//drop the remainder of the line; the result is ignored because an empty remainder is not an error
		(void)fscanf(input, "%*[^\n]");
	}
	fclose(input);
}


/*
 * Scan every record to find the largest item ID and the largest attribute ID,
 * store them in @item_size and @attribute_size, and print how many IDs in
 * 1..max never occur in the data (degree 0) and how many do occur.
 *
 * IDs are used as 1-based array indices, so a record with an ID below 1 is
 * rejected here instead of indexing outside the degree tables.
 * The two temporary degree tables are released before returning.
 * Exits with EXIT_FAILURE on empty or invalid data or on allocation failure.
 * */
void statisticNumberOfItem_Attribute()
{
	int i;
	int maxItemID = 0, maxAttributeID = 0;

	if( data_size < 1 )
	{
		fprintf(stderr, "statistic error: no record to analyse\n");
		exit(EXIT_FAILURE);
	}
	for( i = 1; i <= data_size; i++ )
	{
		if( data[i].itemID < 1 || data[i].attributeID < 1 )
		{
			fprintf(stderr, "statistic error: record %d has an ID smaller than 1\n", i);
			exit(EXIT_FAILURE);
		}
		if( data[i].itemID > maxItemID )
			maxItemID = data[i].itemID;
		if( data[i].attributeID > maxAttributeID )
			maxAttributeID = data[i].attributeID;
	}
	item_size = maxItemID;
	attribute_size = maxAttributeID;
	printf("maxItemID and maxAttributeID is %d %d\n", maxItemID, maxAttributeID);

	//degree of an ID = number of records that mention it; index 0 is unused
	int* item = malloc(sizeof *item * ((size_t)maxItemID + 1));
	if( !item )
	{
		fprintf(stderr, "item malloc error");
		exit(EXIT_FAILURE);
	}
	int* attribute = malloc(sizeof *attribute * ((size_t)maxAttributeID + 1));
	if( !attribute )
	{
		fprintf(stderr, "attribute malloc error");
		free(item);
		exit(EXIT_FAILURE);
	}
	for( i = 1; i <= maxItemID; i++ )
		item[i] = 0;
	for( i = 1; i <= maxAttributeID; i++ )
		attribute[i] = 0;

	for( i = 1; i <= data_size; i++ )
	{
		item[data[i].itemID]++;
		attribute[data[i].attributeID]++;
	}

	//count the IDs that never appear in any record
	int item_count_degree0 = 0, attribute_count_degree0 = 0;
	for( i = 1; i <= maxItemID; i++ )
		if( item[i] == 0 )
			item_count_degree0++;
	for( i = 1; i <= maxAttributeID; i++ )
		if( attribute[i] == 0 )
			attribute_count_degree0++;
	printf("the number of item and attribute that degree equal 0 is %d, %d\n", item_count_degree0, attribute_count_degree0);
	printf("the number of item and attribute that have attributeValue data is %d, %d\n", maxItemID - item_count_degree0, maxAttributeID - attribute_count_degree0);

	free(attribute);
	free(item);
}

/*
 * according to the result of statistic to initialization other struct
 * */
void initial_afterStatistic()
{	
	/*
	 * initialization the adjacent matrix
	 * */
	int i, j;
	adjacentMatrix = (int**)malloc(sizeof(int*) * (item_size + 1));
	if( !adjacentMatrix )
	{
		printf("adjacent malloc error: 0");
		exit(0);
	}
	for( i = 0; i <= item_size; i++ )	
	{
		adjacentMatrix[i] = (int*)malloc(sizeof(int) * (attribute_size + 1));
		if( !adjacentMatrix[i] )
		{
			printf("adjacent malloc error: %d", i);
			exit(0);
		}
	}
	//initialization the adjacent matrix is 0
	for( i = 0; i <= item_size; i++ )
		for( j = 0; j <= attribute_size; j++ )
			adjacentMatrix[i][j] = 0;

	for( i = 1; i <= data_size; i++ )
	{
		adjacentMatrix[data[i].itemID][data[i].attributeID] = data[i].attributeValue;
	}

	//test
	/*
	printf("show adjacent matrix:\n");
	for( i = 1; i <= item_size; i++ )
	{
		for( j = 1; j <= attribute_size; j++ )
			printf("%3d", adjacentMatrix[i][j]);
		printf("\n");
	}
	*/
	//test EDN
	
	/*
	 * initialization the cluster center 
	 * 	a row represents the all @attributeValue of a @itemID, the value of index_0, i.e cluster_center[i][0], is denote the 
	 * 	cluster of item is belong to, which range from 1 to cluster_count.  
	 * */
	cluster_center = (double**)malloc(sizeof(double*) * (cluster_count + 1));
	if( !cluster_center )
	{
		printf("cluster center malloc error: 0");
		exit(0);
	}
	for( i = 1; i <= cluster_count; i++ )
	{
		cluster_center[i] = (double*)malloc(sizeof(double) * (attribute_size + 1));
		if( !cluster_center[i] )
		{
			printf("cluster_center malloe error: %d", i);
			exit(0);
		}
	}
}

/*
 * Pick @cluster_count distinct items at random and copy their attribute rows
 * into @cluster_center as the initial centers.
 *
 * Items are drawn without replacement: @auxiliary holds the candidate item
 * IDs in slots 1..remaining, and each chosen slot is overwritten with the
 * last candidate before the pool shrinks by one. cluster_center[i][0] is set
 * to the cluster ID i (1..cluster_count).
 *
 * More clusters than candidate items cannot be served (the pool would run
 * empty and the modulo below would divide by zero), so that case is rejected
 * with EXIT_FAILURE. The candidate table is released before returning.
 *
 * NOTE(review): no srand() call is visible in this file, so rand() presumably
 * yields the same centers on every run - confirm whether that is intended.
 * */
void generateOriginalClusterCenter()
{
	int i, j;
	int remaining = item_size;

	if( cluster_count > item_size )
	{
		fprintf(stderr, "cluster count %d is larger than item_size %d\n", cluster_count, item_size);
		exit(EXIT_FAILURE);
	}

	int* auxiliary = malloc(sizeof *auxiliary * ((size_t)item_size + 1));
	if( !auxiliary )
	{
		fprintf(stderr, "auxiliary malloc error");
		exit(EXIT_FAILURE);
	}
	for( i = 1; i <= item_size; i++ )
		auxiliary[i] = i;

	for( i = 1; i <= cluster_count; i++ )
	{
		int random_index = rand() % remaining + 1;
		int chosen_item = auxiliary[random_index];

		cluster_center[i][0] = i;			//cluster ID, range from 1 to @cluster_count
		for( j = 1; j <= attribute_size; j++ )
		{
			cluster_center[i][j] = adjacentMatrix[chosen_item][j];
		}
		//remove the chosen item from the pool by moving the last candidate into its slot
		auxiliary[random_index] = auxiliary[remaining--];
	}

	free(auxiliary);
}

/*
 * Compute the Euclidean distance between the attribute row of @itemID and the
 * center @centerID over columns 1..attribute_size, and store it in
 * distanceToCenter[centerID].
 * */
void calculateDistance_ToOneCenter(int itemID, int centerID)
{
	const int* features = adjacentMatrix[itemID];
	const double* center = cluster_center[centerID];
	double sum_of_squares = 0.0;
	int col;

	for( col = 1; col <= attribute_size; col++ )
	{
		double gap = features[col] - center[col];
		sum_of_squares = sum_of_squares + gap * gap;
	}
	distanceToCenter[centerID] = sqrt(sum_of_squares);
}

/*
 * Fill distanceToCenter[1..cluster_count] with the distance from @itemID to
 * every cluster center, one calculateDistance_ToOneCenter() call per center.
 * */
void calculateDistance_ToAllCenter(int itemID)
{
	int center = 1;
	while( center <= cluster_count )
	{
		calculateDistance_ToOneCenter(itemID, center);
		center++;
	}
}

/*
 * Assign @agentID to its nearest cluster.
 *
 * Expects distanceToCenter[1..cluster_count] to already hold the distances of
 * @agentID to every center. The index of the smallest distance (the lowest
 * index wins ties) is written to adjacentMatrix[agentID][0]; column 0 of the
 * matrix stores the cluster of the row, not an attribute value.
 *
 * The item's contribution to @squaredError is the SQUARE of that smallest
 * distance: distanceToCenter holds plain Euclidean distances, so adding them
 * unsquared would not yield a squared error.
 *
 * @round is not used; it is kept so the signature stays unchanged.
 * */
void partition_forOneItem(int agentID, int round)
{
	(void)round;

	int i;
	double min_value = distanceToCenter[1];
	int min_index = 1;
	for( i = 2; i <= cluster_count; i++ )
	{
		if( distanceToCenter[i] < min_value )
		{
			min_value = distanceToCenter[i];
			min_index = i;
		}
	}
	adjacentMatrix[agentID][0] = min_index;

	squaredError = squaredError + min_value * min_value;
}

/*
 * Assignment step of one round: reset @squaredError, assign every item
 * 1..item_size to its nearest center, write the resulting clusters to files
 * via writeToFile_oneCluster(@round) and print the accumulated error.
 * */
void partition_forAllItem_OneCluster(int round)
{
	int current = 1;

	squaredError = 0.0;
	while( current <= item_size )
	{
		calculateDistance_ToAllCenter(current);
		partition_forOneItem(current, round);
		current++;
	}

	//persist the partition of this round before reporting its error
	writeToFile_oneCluster(round);

	printf("\nThe squared_error is %f\n", squaredError);
}

/*
 * Save the partition of round @round: one file per cluster, named
 * .//ClusterProcess//round<round>_cluster<c>.data, containing one item ID per
 * line for every item whose adjacentMatrix[item][0] equals c.
 *
 * The files are opened with fopen(..., "w"), which does not create missing
 * directories, so the ClusterProcess directory has to exist beforehand.
 * The table of stream handles is released before returning.
 * Exits with EXIT_FAILURE on allocation failure, when a file cannot be opened
 * or closed, or when an item carries a cluster index outside 1..cluster_count.
 * */
void writeToFile_oneCluster(int round)
{
	char path[200] = "";
	int i;

	FILE** outputs = malloc(sizeof *outputs * (size_t)(cluster_count + 1));
	if( !outputs )
	{
		fprintf(stderr, "fwrite malloc error");
		exit(EXIT_FAILURE);
	}
	//open file
	for( i = 1; i <= cluster_count; i++ )
	{
		snprintf(path, sizeof path, ".//ClusterProcess//round%d_cluster%d.data", round, i);
		if( NULL == (outputs[i] = fopen(path, "w")))
		{
			fprintf(stderr, "open file(%s) error: ", path);
			exit(EXIT_FAILURE);
		}
	}
	//write data to file
	for( i = 1; i <= item_size; i++ )
	{
		int cluster = adjacentMatrix[i][0];
		if( cluster < 1 || cluster > cluster_count )
		{
			fprintf(stderr, "item %d has invalid cluster %d\n", i, cluster);
			exit(EXIT_FAILURE);
		}
		fprintf(outputs[cluster], "%d\n", i);
	}
	//close file; fclose flushes buffered output, so a failure here means lost data
	for( i = 1; i <= cluster_count; i++ )
	{
		if( 0 != fclose(outputs[i]) )
		{
			fprintf(stderr, "close file error: round %d cluster %d\n", round, i);
			exit(EXIT_FAILURE);
		}
	}
	free(outputs);
}

/*
 * Update step of one round: recompute every cluster center as the mean of the
 * attribute rows of the items currently assigned to it
 * (adjacentMatrix[item][0] is the item's cluster).
 *
 * The new centers are first built in a temporary table, handed to
 * compareNewAndOldClusterCenter() so that @isContinue reflects whether any
 * coordinate changed, and only then copied into @cluster_center.
 *
 * A cluster that received no item keeps its previous center instead of
 * causing a division by zero; a notice is printed for it.
 * All temporary tables are released before returning.
 * Exits with EXIT_FAILURE on allocation failure.
 *
 * @round is not used; it is kept so the signature stays unchanged.
 * */
void calculate_clusterCenter(int round)
{
	(void)round;

	int i, j;

	//number_each_cluster[c] = how many items belong to cluster c
	int* number_each_cluster = malloc(sizeof *number_each_cluster * (size_t)(cluster_count + 1));
	if( !number_each_cluster )
	{
		fprintf(stderr, "number_each_cluster malloc error!");
		exit(EXIT_FAILURE);
	}
	for( i = 1; i <= cluster_count; i++ )
		number_each_cluster[i] = 0;

	//sum[c][j] accumulates attribute j over the items of cluster c, then becomes the mean
	double** sum = malloc(sizeof *sum * (size_t)(cluster_count + 1));
	if( !sum )
	{
		fprintf(stderr, "sum malloc error");
		exit(EXIT_FAILURE);
	}
	for( i = 1; i <= cluster_count; i++ )
	{
		sum[i] = malloc(sizeof *sum[i] * (size_t)(attribute_size + 1));
		if( !sum[i] )
		{
			fprintf(stderr, "sum malloc error: %d" , i);
			exit(EXIT_FAILURE);
		}
		for( j = 1; j <= attribute_size; j++ )
			sum[i][j] = 0.0;
	}

	for( i = 1; i <= item_size; i++ )
	{
		int cluster = adjacentMatrix[i][0];
		number_each_cluster[cluster]++;
		for( j = 1; j <= attribute_size; j++ )
		{
			sum[cluster][j] += adjacentMatrix[i][j];
		}
	}
	for( i = 1; i <= cluster_count; i++ )
	{
		if( number_each_cluster[i] == 0 )
		{
			//nothing to average: carry the old center over unchanged
			printf("cluster %d is empty, keeping its previous center\n", i);
			for( j = 1; j <= attribute_size; j++ )
				sum[i][j] = cluster_center[i][j];
			continue;
		}
		for( j = 1; j <= attribute_size; j++ )
		{
			sum[i][j] = sum[i][j] / (double)number_each_cluster[i];
		}
	}

	//must run before the copy below, while @cluster_center still holds the old centers
	compareNewAndOldClusterCenter(sum);

	//update the cluster center
	for( i = 1; i <= cluster_count; i++ )
	{
		for( j = 1; j <= attribute_size; j++ )
		{
			cluster_center[i][j] = sum[i][j];
		}
	}

	for( i = 1; i <= cluster_count; i++ )
		free(sum[i]);
	free(sum);
	free(number_each_cluster);
}

/*
 * Decide whether another round is needed by comparing the candidate centers
 * in @sum with the current @cluster_center, coordinate by coordinate
 * (rows 1..cluster_count, columns 1..attribute_size, exact comparison).
 * Sets @isContinue to 1 as soon as one coordinate differs, otherwise to 0.
 * */
void compareNewAndOldClusterCenter(double** sum)
{
	int center, col;

	for( center = 1; center <= cluster_count; center++ )
	{
		for( col = 1; col <= attribute_size; col++ )
		{
			if( sum[center][col] != cluster_center[center][col] )
			{
				isContinue = 1;		// 1 represents another round is required
				return;
			}
		}
	}
	isContinue = 0;					// 0 represents will be stop
}

/*******************************************************************************************
 *				K-means algorithm
 *
 * Repeats the assignment step and the update step for at most MAX_ROUND
 * rounds, printing a banner before each round, and stops early once
 * calculate_clusterCenter() leaves @isContinue at 0.
 *******************************************************************************************/
void kmeans()
{
	int step = 1;
	while( step <= MAX_ROUND )
	{
		printf("------------------round %d---------------------\n", step);
		partition_forAllItem_OneCluster(step);
		calculate_clusterCenter(step);
		if( !isContinue )
			break;
		step++;
	}
}
</span>


1
0

查看评论
* 以上用户言论只代表其个人观点,不代表CSDN网站的观点或立场
    个人资料
    • 访问:70986次
    • 积分:1441
    • 等级:
    • 排名:千里之外
    • 原创:75篇
    • 转载:0篇
    • 译文:0篇
    • 评论:48条
    最新评论