最近在学习粗糙集相关知识。模糊集作为相似的领域也一并了解。在应用方面,模糊集在图像分割方面应用比较广。中文资料中,模糊C均值聚类算法的资料最为丰富,因此使用模糊C均值聚类算法(FCM)作为学习的起点吧。
模糊C均值聚类算法的原理在这篇博客中讲得非常清楚。
算法原理详解.
原理的主要难点是对目标函数求偏导以获得极值点。由此也带来了算法的问题之一:通过求偏导得到的极值点只是驻点,不一定是目标函数的全局最小值。
为了方便对比,我参照代码原地址重新封装了代码,并使用numpy对运行效率进行了优化。使之尽量和sklearn的调用方法相同。代码如下:
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 3 09:26:30 2019
@author: BHN
"""
import numpy as np
class FCM:
    """Fuzzy C-Means (FCM) clustering with an sklearn-like interface.

    Usage: ``FCM(n_clusters=k).fit_predict(data)``.

    Attributes:
        n_clusters: number of clusters C.
        u: (n_samples, C) fuzzy membership matrix; each row sums to 1.
            ``None`` until ``fit`` has been called.
        core: (C, n_features) array of cluster centers; ``None`` until fitted.
    """

    def __init__(self, n_clusters):
        super(FCM, self).__init__()
        self.n_clusters = n_clusters
        self.u = None      # membership matrix, filled in by fit()
        self.core = None   # cluster centers, filled in by fit()

    def initialize_U(self, data_num):
        """Return a random (data_num, n_clusters) membership matrix whose rows sum to 1."""
        u = np.random.rand(data_num, self.n_clusters)
        # Normalize each row so one sample's memberships form a distribution.
        u = np.true_divide(u, np.sum(u, axis=1, keepdims=True))
        self.u = u
        return u

    def fit(self, data, m=2, max_item=100):
        """Alternate center/membership updates until the memberships stop changing.

        Args:
            data: (n_samples, n_features) array of points to cluster.
            m: fuzzifier exponent, must be > 1 (m=2 is the conventional choice).
            max_item: maximum number of iterations.
        """
        self.u = self.initialize_U(data.shape[0])
        for _ in range(max_item):
            # ---- update cluster centers: u^m-weighted mean of the data ----
            w = np.power(self.u, m)                                   # (N, C)
            self.core = np.dot(w.T, data) / np.sum(w, axis=0)[:, None]  # (C, F)
            # ---- Euclidean distance from every sample to every center ----
            diff = data[None, :, :] - self.core[:, None, :]           # (C, N, F)
            distance = np.sqrt(np.sum(np.power(diff, 2), axis=2)).T   # (N, C)
            # Guard: a sample sitting exactly on a center would otherwise
            # divide by zero in the membership update below.
            distance = np.fmax(distance, np.finfo(float).eps)
            # ---- update membership matrix ----
            # BUGFIX: the update derived from the FCM objective is
            # u_ij = 1 / sum_k (d_ij / d_ik)^(2/(m-1)); the original code
            # applied the exponent 1/(m-1) to the (un-squared) distance.
            ratio = distance[:, :, None] / distance[:, None, :]       # (N, C, C)
            new_u = 1.0 / np.sum(np.power(ratio, 2.0 / (m - 1)), axis=2)
            # ---- stop once the membership matrix no longer changes ----
            delta = np.sum(np.fabs(np.subtract(self.u, new_u)))
            self.u = new_u
            if delta == 0:
                break

    def fit_predict(self, data, m=2, max_item=1000):
        """Fit (only if not already fitted) and return each sample's hard label."""
        # BUGFIX: the original tested ``self.u == None``. After a fit, ``u``
        # is an ndarray, so ``==`` is elementwise and a second call raised
        # "The truth value of an array ... is ambiguous".
        if self.u is None:
            self.fit(data, m=m, max_item=max_item)
        return np.argmax(self.u, axis=1)
if __name__ == '__main__':
    # Smoke test: cluster 5 random 3-D points into 2 groups and show labels.
    model = FCM(n_clusters=2)
    labels = model.fit_predict(np.random.rand(5, 3), 2)
    print(labels)
最后和K-means算法进行对比,发现在我对比的几种数据分布中,FCM算法的聚类效果要差于K-means。原因可能是隶属函数的引入并没有帮助其改善聚类效果。两种算法的效果对比图如下。
导出的图片.jpg