C均值聚类
算法步骤
1. 在样本集合中选择C个点作为初始类中心;
2. 在剩下的样本点中选择一个,计算其到各个类中心的距离,并将其归入距离最近的类中心所在的类别;
3. 选择下一个样本,重复步骤2直到处理完所有样本;若类的划分不再发生变化或达到迭代上限,则转步骤5,否则转步骤4;
4. 根据当前的类划分情况重新计算各类的中心点,然后转回步骤2;
5. 结束算法。
C实现
/*
@Time : 2020/12/4 0:04
@Author : Li Canghao
@Name : C_means.py
@Software : C-Free
*/
#include<stdio.h>
#include<math.h>
/* Capacity used when declaring the fixed-size twoD buffers in this file. */
const long long maxn = 10004;
/* A sample (pattern) described by two feature values. */
typedef struct twoD {
    double x;   /* first feature value  */
    double y;   /* second feature value */
} twoD;
/*
 * Euclidean distance between two points.
 *
 * a, b - the two points to compare
 * Returns the non-negative straight-line distance between a and b.
 */
double Distance(twoD a, twoD b) {
    const double dx = a.x - b.x;
    const double dy = a.y - b.y;
    const double squared = dx * dx + dy * dy;

    /* fabs is a no-op on the non-negative sqrt result; kept as in the original. */
    return fabs(sqrt(squared));
}
/*
 * Seed the class centres with the first c training samples.
 *
 * trains  - the samples; at least c entries are read
 * c       - number of classes
 * centers - output array; entries 0..c-1 are overwritten
 */
void InitCenters(twoD trains[], int c, twoD centers[]) {
    int k = 0;
    while (k < c) {
        centers[k] = trains[k];   /* struct copy carries both x and y */
        ++k;
    }
}
/*
 * Assign one sample to the class whose centre is nearest.
 *
 * index   - position of the sample; belong[index] receives the class index
 * train   - the sample itself
 * c       - number of classes (callers must pass c >= 1)
 * centers - current class centres, c entries
 * belong  - output: class index per sample
 * cnt     - per-class member counters; the chosen class is incremented
 *
 * Ties go to the lowest class index because only a strictly smaller
 * distance replaces the current best.
 */
void SearchMinDistance(int index, twoD train, int c, twoD centers[], int belong[], int cnt[]) {
    /* Start from +infinity so that samples farther than any fixed sentinel
       from every centre are still assigned to their true nearest class. */
    double mindistance = INFINITY;
    int minindex = 0;

    for (int i = 0; i < c; i++) {
        /* Evaluate the distance once per centre instead of twice. */
        double d = Distance(train, centers[i]);
        if (d < mindistance) {
            mindistance = d;
            minindex = i;
        }
    }

    belong[index] = minindex;   /* this sample now belongs to class minindex */
    cnt[minindex]++;            /* one more member in that class */
}
/*
 * C-means clustering of n two-dimensional samples into c classes.
 *
 * The first c samples seed the class centres (InitCenters). Each pass assigns
 * every sample to its nearest centre (SearchMinDistance) and then moves each
 * centre to the mean of its members. Iteration stops once every centre moved
 * by less than 1e-6 on both axes, or after max_iterations passes. The final
 * centres and the members of each class are printed to stdout.
 *
 * n      - number of samples in trains
 * trains - the samples
 * c      - number of classes; must satisfy 0 < c <= n
 */
void C_mean(int n, twoD trains[], int c) {
    const int max_iterations = 1000;   /* safety cap on the number of passes */

    /* Reject sizes that would create empty arrays or seed centres from
       samples that do not exist. */
    if (c <= 0 || c > n) {
        printf("参数错误:需要满足 0 < c <= n\n");
        return;
    }

    twoD centers[c];   /* current class centres */
    twoD avg[c];       /* coordinate sums, then means, for the new centres */
    int belong[n];     /* class index of every sample */
    int cnt[c];        /* number of samples in every class */
    int counts = 0;    /* classes whose centre did not move in the last pass */

    InitCenters(trains, c, centers);

    for (int iter = 0; iter < max_iterations && counts < c; iter++) {
        for (int i = 0; i < c; i++) {   /* reset the per-pass accumulators */
            cnt[i] = 0;
            avg[i].x = 0;
            avg[i].y = 0;
        }

        for (int i = 0; i < n; i++) {   /* assign, then accumulate for the mean */
            SearchMinDistance(i, trains[i], c, centers, belong, cnt);
            avg[belong[i]].x += trains[i].x;
            avg[belong[i]].y += trains[i].y;
        }

        counts = 0;
        for (int i = 0; i < c; i++) {
            if (cnt[i] == 0) {
                /* Empty class: keep the old centre rather than dividing by
                   zero, and treat it as unchanged. */
                counts++;
                continue;
            }
            avg[i].x /= cnt[i];
            avg[i].y /= cnt[i];
            /* Compare absolute shifts so a large move in the negative
               direction is not mistaken for "no change". */
            if (fabs(centers[i].x - avg[i].x) < 1e-6 && fabs(centers[i].y - avg[i].y) < 1e-6) {
                counts++;
            }
            centers[i] = avg[i];   /* adopt the new centre */
        }
    }

    printf("\n-----处理完毕,展示结果-----\n");
    for (int i = 0; i < c; i++) {
        printf("当前第%d类,聚类中心为:(%lf,%lf) 共有%d个模式,其中的集合为:\n",i+1,centers[i].x,centers[i].y,cnt[i]);
        for (int j = 0; j < n; j++) {
            if (belong[j] == i) printf("\t%d:(%lf,%lf)\n",j+1,trains[j].x,trains[j].y);
        }
    }
}
/*
 * Interactive driver: reads the class count, the sample count and the
 * samples from stdin, then runs C_mean on them.
 *
 * Returns 0 on success and 1 when the input is malformed or out of range.
 */
int main(){
    twoD trains[maxn];
    int n,c;

    printf("请输入需要分成多少类:");
    if (scanf("%d",&c) != 1 || c <= 0) {
        printf("输入无效:类别数必须是正整数\n");
        return 1;
    }

    printf("请输入模式总个数:(n > c)");
    /* n is bounded by maxn because trains holds at most maxn samples. */
    if (scanf("%d",&n) != 1 || n < c || n > maxn) {
        printf("输入无效:模式总个数必须不小于类别数且不超过%lld\n",maxn);
        return 1;
    }

    printf("请输入各模式的特征值(二维):\n");
    for (int i = 0; i < n; i++) {
        /* Stop on a short or non-numeric read instead of clustering
           uninitialised values. */
        if (scanf("%lf%lf",&trains[i].x,&trains[i].y) != 2) {
            printf("输入无效:第%d个模式的特征值读取失败\n",i+1);
            return 1;
        }
    }

    printf("-----开始C均值聚类-----\n");
    C_mean(n,trains,c);
    printf("-----C均值聚类结束-----\n");
    return 0;
}
测试数据 少量样本
1 2
4 5
7 3
100 20
90 50
-5 6
50 89
2000 414
2000 808
2020 124
Python实现
# -*- coding: utf-8 -*-
# @Time : 2020/12/4 0:04
# @Author : Li Canghao
# @Name : C_means.py
# @Software : PyCharm
import math
import random
import matplotlib.pyplot as plt #用于做图
class twoD:
    """A sample (pattern) in the two-dimensional feature plane.

    Attributes:
        x: First coordinate, stored as a float.
        y: Second coordinate, stored as a float.
        belong: Index of the class the point is assigned to.
    """

    # Class-level defaults. x and y are replaced per instance in __init__;
    # belong stays at 0 until some caller assigns it on the instance.
    x = 0.0
    y = 0.0
    belong = 0

    def __init__(self, x, y):
        """Create a point; x and y may be anything float() accepts."""
        self.x = float(x)
        self.y = float(y)

    def __add__(self, other):
        """Return a new point holding the coordinate-wise sum.

        Neither operand is modified, so ``a + b`` yields a usable value
        instead of ``None``.
        """
        return twoD(self.x + other.x, self.y + other.y)

    def __repr__(self):
        return "twoD(%r, %r)" % (self.x, self.y)

    def toString(self):
        """Return the point as "x y", the format used in the data file."""
        return str(self.x) + " " + str(self.y)
def Distance(a, b):
    """Return the Euclidean distance between a and b (read via .x and .y)."""
    dx = a.x - b.x
    dy = a.y - b.y
    return math.sqrt(dx ** 2 + dy ** 2)
def GenerateTrains():
    """Write a fresh random training set to ``trains.txt``.

    Between 100 and 200 points are generated, each coordinate drawn uniformly
    from [-2020, 2020]. The file holds one "x y" pair per line with no
    trailing newline. I/O failures are reported on stdout, not raised.
    """
    number = random.randint(100, 200)
    # uniform(-2020, 2020) keeps every coordinate inside the documented range;
    # the previous scaling by 4041 could produce values up to 2021.
    lines = [
        "{} {}".format(random.uniform(-2020, 2020), random.uniform(-2020, 2020))
        for _ in range(number)
    ]
    try:
        # Mode "w" rather than "a": repeated runs overwrite the file instead
        # of growing it.
        with open("trains.txt", "w") as f:
            try:
                # Joining with "\n" leaves no trailing newline after the
                # last sample.
                f.write("\n".join(lines))
                print("-----创建训练集成功-----")
            finally:
                # Reported on success and on failure; the with-block closes
                # the file immediately afterwards.
                print("-----文件关闭-----")
    except OSError as ex:
        print("-----出现异常", ex, "-----")
def ReadTrains():
    """Load the training set from ``trains.txt``.

    Every non-blank line must hold at least two whitespace-separated numbers;
    the first two become the x and y of a ``twoD`` point.

    Returns:
        The list of points, or ``None`` when the file cannot be opened or a
        line is malformed (the problem is reported on stdout).
    """
    trains = []
    try:
        with open("trains.txt", "r") as f:
            print("-----读取训练集成功-----")
            try:
                for line in f:
                    fields = line.split()
                    if not fields:
                        # A blank line (e.g. a trailing newline) carries no
                        # sample and must not discard the whole data set.
                        continue
                    trains.append(twoD(fields[0], fields[1]))
            finally:
                # Reported on success and on failure; the with-block closes
                # the file immediately afterwards.
                print("-----文件关闭-----")
        return trains
    except (OSError, ValueError, IndexError) as ex:
        print("-----出现异常", ex, "-----")
        return None
def C_mean(trains, c, max_iter=1000):
    """Cluster ``trains`` into ``c`` classes with the C-means iteration.

    The first ``c`` samples seed the class centres. Each pass assigns every
    sample to its nearest centre (ties go to the lowest class index) and then
    moves each centre to the mean of its members. Iteration stops when no
    centre moves by 1e-6 or more on either axis, or after ``max_iter`` passes.
    The result is printed and drawn as a scatter plot.

    Args:
        trains: List of points exposing ``x`` and ``y``; the ``belong``
            attribute of every point is overwritten with its class index.
        c: Number of classes; must satisfy ``1 <= c <= len(trains)``.
        max_iter: Upper bound on the number of passes.

    Returns:
        The list of final class centres.

    Raises:
        ValueError: If ``c`` is outside ``1..len(trains)``.
    """
    if not 1 <= c <= len(trains):
        raise ValueError("c must satisfy 1 <= c <= len(trains)")

    # The slice is a new list, so replacing entries below never alters trains.
    centers = trains[0:c]
    numbers = [0] * c  # members per class, rebuilt on every pass

    for _ in range(max_iter):
        numbers = [0] * c
        sum_x = [0.0] * c
        sum_y = [0.0] * c

        for train in trains:
            distances = [Distance(train, center) for center in centers]
            # index(min(...)) returns the first minimum, i.e. the lowest
            # class index among equally near centres.
            minindex = distances.index(min(distances))
            train.belong = minindex
            sum_x[minindex] += train.x
            sum_y[minindex] += train.y
            numbers[minindex] += 1

        # Must restart at zero every pass; a running total would end the
        # loop before the centres have actually settled.
        counts = 0
        for i, center in enumerate(centers):
            if numbers[i] == 0:
                # Empty class: keep the old centre instead of dividing by
                # zero, and treat it as unchanged.
                counts += 1
                continue
            new_center = twoD(sum_x[i] / numbers[i], sum_y[i] / numbers[i])
            # Absolute differences, so a large shift in the negative
            # direction is not mistaken for "no change".
            if (abs(new_center.x - center.x) < 1e-6
                    and abs(new_center.y - center.y) < 1e-6):
                counts += 1
            centers[i] = new_center

        if counts == c:
            break

    print("-----处理完毕,展示结果-----")
    colors = ["red","blue","green","coral","tan","yellow","brown","gold","orange","peru"]
    marks = ["+","x","o","v","^","<",">","1","2","3"]
    for i, center in enumerate(centers):
        # %d truncates the float coordinates to integers in this report.
        print("当前第%d类,类心为:(%d,%d) 共有%d个模式,它们分别是:"%(i + 1,center.x,center.y,numbers[i]))
        # Wrap around the palettes so more than ten classes do not raise
        # IndexError; shifting the marker every ten classes keeps the
        # (colour, marker) pair distinct for up to 100 classes.
        color = colors[i % len(colors)]
        mark = marks[(i + i // len(colors)) % len(marks)]
        for j, train in enumerate(trains):
            if train.belong == i:
                print("\t%d:(%d,%d)"%(j+1,train.x,train.y))
                plt.scatter(train.x, train.y, marker=mark, c=color)
    plt.show()
    return centers
'''def ShowPlot():
for i in range(c):
plt.scatter()'''
# Script entry: build a random training set, read it back, ask for the number
# of classes and run the clustering.
print("-----准备创建训练集-----")
GenerateTrains()
print("-----准备读取训练集-----")
trains = ReadTrains()
if not trains:
    # ReadTrains yields None after reporting a failure, and an empty list has
    # nothing to cluster; stop with a message instead of failing later.
    raise SystemExit("-----训练集为空,无法进行聚类-----")
print(len(trains))
c = int(input("请输入需要分成多少类"))
print("-----C均值聚类开始-----")
C_mean(trains,c)
print("-----C均值聚类结束-----")
#ShowPlot()
测试数据在GenerateTrains()函数中创建
结果
为了能直观地看出聚类效果,这里截取了Python实现运行时生成的图,其中c=5
后话
一天学完Python并实现上面这个算法真是太nice了(头晕目眩)