C均值聚类 C实现 Python实现

C均值聚类

算法步骤

  1. 在样本集合中选择C个点作为初始类中心;
  2. 在剩下的样本点中选择一个,计算其到各个中心点的距离,选取距离最短者将其归为那个类别;
  3. 选择下一个样本,重复2直到计算完所有样本,若集合不发生变化或达到迭代上限则转5否则转4;
  4. 根据当前的类划分情况重新计算中心点,重复步骤2;
  5. 结束算法。

C实现

/*
	@Time : 2020/12/4 0:04
	@Author : Li Canghao
	@Name : C_means.c
	@Software : C-Free
*/
#include<stdio.h>
#include<math.h>
// Capacity used for arrays declared with maxn (e.g. the sample buffer in main).
const long long maxn = 10004;
// A point in the 2-D feature plane; used for both samples and class centers.
typedef struct twoD{
	double x,y; 	// the two feature values (coordinates) of the point
}twoD;
/*
 * Return the Euclidean distance between points a and b.
 *
 * hypot() computes sqrt(dx*dx + dy*dy) without the intermediate overflow or
 * underflow that squaring by hand can cause. Its result is never negative,
 * so no fabs() is needed.
 */
double Distance(twoD a,twoD b){
	return hypot(a.x - b.x, a.y - b.y);
}
/*
 * Seed the class centers: the first c samples of trains are copied, in order,
 * into centers[0..c-1].
 */
void InitCenters(twoD trains[],int c,twoD centers[]){
	int k = 0;
	while(k < c){
		centers[k] = trains[k];			// struct assignment copies both x and y
		k++;
	}
}
/*
 * Assign one sample to its nearest class center.
 *
 * index   - position of the sample in the training set
 * train   - the sample itself
 * c       - number of classes / valid entries in centers
 * centers - current class centers
 * belong  - out: belong[index] receives the index of the nearest center
 * cnt     - in/out: cnt[k] is incremented for the chosen class k
 *
 * Ties are resolved in favour of the lowest center index.
 */
void SearchMinDistance(int index,twoD train,int c,twoD centers[],int belong[],int cnt[]){
	// Start from +infinity so that any finite distance wins, however large the
	// coordinates are (a fixed sentinel such as 1e6 fails for far-apart data).
	double mindistance = INFINITY;
	int minindex = 0;
	for(int i = 0; i < c; i++){
		double d = Distance(train,centers[i]);	// evaluate once per center
		if(d < mindistance){
			mindistance = d;
			minindex = i;
		}
	}
	belong[index] = minindex;					// the sample belongs to class minindex
	cnt[minindex]++;							// one more sample in that class
}

/*
 * C-means (k-means) clustering of n two-dimensional samples into c classes.
 *
 * n      - number of samples in trains
 * trains - the samples
 * c      - number of classes; must satisfy 0 < c <= n
 *
 * The first c samples seed the class centers. Each pass assigns every sample
 * to its nearest center and then moves each center to the mean of its members.
 * Iteration stops when no center moves by 1e-6 or more on either axis, or
 * after max_iterations passes. The final partition is printed to stdout.
 */
void C_mean(int n,twoD trains[],int c){
	const int max_iterations = 1000;			// safety cap so the loop always terminates
	if(c <= 0 || n <= 0 || c > n){
		// Also protects the variable-length arrays below from a non-positive size.
		fprintf(stderr,"参数错误:需要满足 0 < c <= n\n");
		return;
	}
	twoD centers[c];							// current class centers
	twoD avg[c];								// accumulators for the new centers
	int belong[n];								// belong[i] = class index of sample i
	int cnt[c];									// number of samples in each class
	int counts = 0;								// number of centers that did not move in the last pass
	InitCenters(trains,c,centers);
	for(int iter = 0; iter < max_iterations && counts < c; iter++){
		for(int i = 0; i < c; i++){				// reset per-pass accumulators
			cnt[i] = 0;
			avg[i].x = 0;
			avg[i].y = 0;
		}
		for(int i = 0; i < n; i++){				// assign every sample to its nearest center
			SearchMinDistance(i,trains[i],c,centers,belong,cnt);
		}
		for(int i = 0; i < n; i++){				// sum the coordinates of each class
			avg[belong[i]].x += trains[i].x;
			avg[belong[i]].y += trains[i].y;
		}
		counts = 0;
		for(int i = 0; i < c; i++){
			if(cnt[i] == 0){
				// Empty class: keep the old center instead of dividing by zero
				// (which would produce NaN and prevent convergence).
				counts++;
				continue;
			}
			avg[i].x /= cnt[i];					// mean of the class = new center
			avg[i].y /= cnt[i];
			// Compare absolute differences; a signed comparison would treat a
			// large negative shift as "unchanged".
			if(fabs(centers[i].x-avg[i].x) < 1e-6 && fabs(centers[i].y-avg[i].y) < 1e-6){
				counts++;
			}
			centers[i] = avg[i];				// update the center
		}
	}
	printf("\n-----处理完毕,展示结果-----\n");
	for(int i = 0; i < c; i++){
		printf("当前第%d类,聚类中心为:(%lf,%lf) 共有%d个模式,其中的集合为:\n",i+1,centers[i].x,centers[i].y,cnt[i]);
		for(int j = 0; j < n; j++){
			if(belong[j] == i)printf("\t%d:(%lf,%lf)\n",j+1,trains[j].x,trains[j].y);
		}
	}
}
/*
 * Interactive test driver: reads the class count c, the sample count n and
 * n pairs of feature values from stdin, then runs C_mean on them.
 *
 * Returns 0 on success and 1 when the input is malformed or out of range.
 */
int main(){
	twoD trains[maxn];
	int n,c;
	printf("请输入需要分成多少类:");
	if(scanf("%d",&c) != 1 || c <= 0){
		fprintf(stderr,"输入无效:类别数必须为正整数\n");
		return 1;
	}
	printf("请输入模式总个数:(n > c)");
	// n is bounded by maxn because trains has exactly maxn slots.
	if(scanf("%d",&n) != 1 || n < c || n > maxn){
		fprintf(stderr,"输入无效:模式总个数必须满足 c <= n <= %lld\n",maxn);
		return 1;
	}
	printf("请输入各模式的特征值(二维):\n");
	for(int i = 0; i < n; i++){
		if(scanf("%lf%lf",&trains[i].x,&trains[i].y) != 2){
			fprintf(stderr,"输入无效:第%d个模式需要两个数值\n",i+1);
			return 1;
		}
	}
	printf("-----开始C均值聚类-----\n");
	C_mean(n,trains,c);
	printf("-----C均值聚类结束-----\n");
	return 0;
}


测试数据 少量样本
1 2
4 5
7 3
100 20
90 50
-5 6
50 89
2000 414
2000 808
2020 124

Python实现

# -*- coding: utf-8 -*-
# @Time : 2020/12/4 0:04
# @Author : Li Canghao
# @Name : C_means.py
# @Software : PyCharm
import math
import random
import matplotlib.pyplot as plt   #用于做图

class twoD:
    """A 2-D point that also records which cluster it is assigned to.

    Attributes:
        x: First coordinate, stored as ``float``.
        y: Second coordinate, stored as ``float``.
        belong: Index of the cluster the point belongs to (0 until assigned).
    """

    # Class-level defaults; kept so attribute access on the class itself
    # keeps working and ``belong`` has a value before any assignment.
    x = 0.0
    y = 0.0
    belong = 0      # index of the owning cluster

    def __init__(self, x, y):
        """Create a point from any values ``float()`` accepts (numbers or numeric strings)."""
        self.x = float(x)
        self.y = float(y)

    def __add__(self, other):
        """Return a new point holding the component-wise sum; operands are left untouched."""
        return twoD(self.x + other.x, self.y + other.y)

    def __iadd__(self, other):
        """Add ``other`` to this point in place (``p += q``) and return ``self``."""
        self.x += other.x
        self.y += other.y
        return self

    def __repr__(self):
        return "twoD({!r}, {!r})".format(self.x, self.y)

    def toString(self):
        """Return the point as ``"<x> <y>"``, the line format used in the training file."""
        return str(self.x) + " " + str(self.y)

def Distance(a,b):
    """Return the Euclidean distance between two objects exposing ``x`` and ``y``.

    ``math.hypot`` is used instead of squaring by hand so that very large
    coordinate differences do not raise ``OverflowError``.
    """
    return math.hypot(a.x - b.x, a.y - b.y)

def GenerateTrains():               # Randomly generate a training set and write it to a file
    """Write between 100 and 200 random 2-D samples to ``trains.txt``.

    Each line has the form ``"<x> <y>"`` (the same text ``twoD.toString``
    produces) with both coordinates drawn uniformly from [-2020, 2020].
    The file is overwritten on every call and has no trailing newline.
    I/O errors are reported on stdout rather than raised.
    """
    number = random.randint(100, 200)
    # random.uniform keeps values inside the documented range; the previous
    # ``random() * 4041 - 2020`` could produce values up to 2021.
    lines = [
        "{} {}".format(random.uniform(-2020, 2020), random.uniform(-2020, 2020))
        for _ in range(number)
    ]
    try:
        # Mode "w": the file is regenerated on every run, so overwrite rather than append.
        f = open("trains.txt", "w", encoding="utf-8")
        try:
            with f:
                f.write("\n".join(lines))
            # Report success only after the file was flushed and closed cleanly.
            print("-----创建训练集成功-----")
        finally:
            print("-----文件关闭-----")
    except OSError as ex:
        print("-----出现异常",ex,"-----")

def ReadTrains():                   # Read the training set
    """Load the samples stored in ``trains.txt``.

    Every non-blank line must start with two whitespace-separated numbers;
    any further fields on the line are ignored. Blank lines are skipped, so
    a trailing newline no longer invalidates the whole file.

    Returns:
        A list of ``twoD`` points, or ``None`` when the file cannot be opened
        or a line cannot be parsed (the error is printed to stdout).
    """
    trains = []
    try:
        f = open("trains.txt", "r", encoding="utf-8")
        print("-----读取训练集成功-----")
        try:
            with f:
                for line in f:
                    fields = line.split()
                    if not fields:
                        continue
                    # Unpacking raises ValueError when fewer than two values are present.
                    x, y = fields[:2]
                    trains.append(twoD(x, y))
        finally:
            print("-----文件关闭-----")
    except (OSError, ValueError) as ex:
        print("-----出现异常",ex,"-----")
        return None
    return trains

def C_mean(trains,c,max_iter=1000):
    """Cluster ``trains`` into ``c`` classes with the C-means (k-means) algorithm.

    The first ``c`` samples seed the class centers. Each pass assigns every
    sample to its nearest center (ties go to the lowest index) and then moves
    each center to the mean of its members. Iteration stops when no center
    moves by 1e-6 or more on either axis, or after ``max_iter`` passes.

    Side effects: sets ``belong`` on every sample, prints the resulting
    partition and shows a matplotlib scatter plot of the clusters.

    Args:
        trains: List of ``twoD`` samples.
        c: Number of classes; must satisfy ``1 <= c <= len(trains)``.
        max_iter: Upper bound on the number of passes.

    Raises:
        ValueError: If ``c`` is outside ``1..len(trains)``.
    """
    if not 1 <= c <= len(trains):
        raise ValueError("c must be between 1 and the number of samples")

    # Copy the seed points so that centers never alias training samples.
    centers = [twoD(p.x, p.y) for p in trains[:c]]
    numbers = [0] * c                           # number of samples per class

    for _ in range(max_iter):
        numbers = [0] * c
        sum_x = [0.0] * c
        sum_y = [0.0] * c
        for train in trains:
            # min() returns the first minimum, matching a strict "<" scan.
            nearest = min(range(c), key=lambda j: Distance(train, centers[j]))
            train.belong = nearest
            sum_x[nearest] += train.x
            sum_y[nearest] += train.y
            numbers[nearest] += 1

        # Must restart from zero on every pass; a running total would end the
        # loop before the centers have actually settled.
        unchanged = 0
        for i, center in enumerate(centers):
            if numbers[i] == 0:
                # Empty class: keep its center rather than dividing by zero.
                unchanged += 1
                continue
            new_x = sum_x[i] / numbers[i]
            new_y = sum_y[i] / numbers[i]
            # Absolute differences: a signed test would treat a large
            # negative shift as "unchanged".
            if abs(new_x - center.x) < 1e-6 and abs(new_y - center.y) < 1e-6:
                unchanged += 1
            centers[i] = twoD(new_x, new_y)
        if unchanged == c:
            break

    print("-----处理完毕,展示结果-----")
    colors = ["red","blue","green","coral","tan","yellow","brown","gold","orange","peru"]
    marks = ["+","x","o","v","^","<",">","1","2","3"]
    for i,center in enumerate(centers):
        print("当前第%d类,类心为:(%d,%d) 共有%d个模式,它们分别是:"%(i + 1,center.x,center.y,numbers[i]))
        xs = []
        ys = []
        for j,train in enumerate(trains):
            if train.belong == i:
                print("\t%d:(%d,%d)"%(j+1,train.x,train.y))
                xs.append(train.x)
                ys.append(train.y)
        # One scatter call per class; styles wrap around when c exceeds the
        # number of predefined colors/markers.
        plt.scatter(xs,ys,marker = marks[i % len(marks)],c = colors[i % len(colors)])
    plt.show()



'''def ShowPlot():
    for i in range(c):
        plt.scatter()'''

# Script entry: generate a random training set, read it back, ask for the
# number of classes and run the clustering.
print("-----准备创建训练集-----")
GenerateTrains()
print("-----准备读取训练集-----")
trains = ReadTrains()
if not trains:
    # ReadTrains() yields None when trains.txt could not be opened or parsed;
    # stop with a clear message instead of failing on len(None).
    raise SystemExit("-----训练集读取失败或为空,程序终止-----")
print(len(trains))
try:
    c = int(input("请输入需要分成多少类"))
except ValueError:
    raise SystemExit("-----类别数必须是整数-----")
if not 1 <= c <= len(trains):
    # The first c samples seed the class centers, so c cannot exceed the sample count.
    raise SystemExit("-----类别数必须在1到样本总数之间-----")
print("-----C均值聚类开始-----")
C_mean(trains,c)
print("-----C均值聚类结束-----")


测试数据在GenerateTrains()函数中创建

结果

为了能直观地看出聚类效果,这里截取了Python实现运行后生成的散点图,此处c=5
在这里插入图片描述

后话

一天学完Python并实现上面这个算法真是太nice了(头晕目眩)

已标记关键词 清除标记
©️2020 CSDN 皮肤主题: 数字20 设计师:CSDN官方博客 返回首页