相关学习链接
- 视频(https://www.bilibili.com/video/BV18J411a7yY?p=1)
- 博客:(https://blog.csdn.net/in_nocence/article/details/78306297)
- 文档:(https://wenku.baidu.com/view/ee968c00eff9aef8941e06a2.html)
- 代码:(https://blog.csdn.net/zwqhehe/article/details/75174918)
上面第一个链接视频是台湾一位老师讲的课程,可谓是相当详细,可以把一个人从零讲懂,里面的原理讲得很透彻,所有推导过程都讲了,虽然视频时间较长,但是强烈推荐看完前面三节的推导过程,一定会收获颇多。看完该视频,基本上原理就很了解了,然后再看一些博客和代码就会十分轻松,基本就掌握了FCM,以上也是我本人学习FCM的过程。
学习总结:
前段时间刚刚学习过K-means聚类算法,最近在看FCM,其实有了k-means的基础,学起来要容易理解得多。它们都是无监督的聚类算法,算法流程基本一致:
- 指明聚类数量;
- 初始聚类中心;
- 计算每个点到聚类中心的距离,划分每个点的归属类别;
- 重新计算聚类中心;
- 判断结束条件是否达到,否则重复第3步。
FCM与k-means的区别在于:在计算每个点到聚类中心的距离时并不是直接使用欧氏距离,而是加入了权重(隶属度)系数,也就是每个点到聚类中心的距离等于欧氏距离与权重系数之积,其它内容完全一致。稍微复杂的难点就是如何计算出权重系数矩阵U,在上面学习链接中的视频和博客对计算U做了详细推导。
代码:
下面附上本人对参考链接里面的代码进行修改后的代码:
1.main.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/5/20 18:23
# @Author : ystraw
# @Site :
# @File : main.py
# @Software: PyCharm Community Edition
# @function: FCM 模糊C均值聚类
import copy
import math
import random
import time
import pandas as pd
# Upper bound for the random integers drawn when initialising the
# membership matrix U (see initialise_U).
# NOTE: the original had redundant module-level `global` statements,
# which are no-ops at module scope; they have been removed.
MAX = 10000.0
# Convergence threshold: iteration stops once no entry of U changes by
# more than Epsilon between two consecutive iterations.
Epsilon = 0.00000001
def import_data_format_iris(file):
    """
    Parse the iris data file: the first four columns become the feature
    vectors (data), the last column (species name) becomes the label.

    Data source: http://archive.ics.uci.edu/ml/machine-learning-databases/iris/

    Returns (data, cluster_location) where data is a list of float
    feature lists and cluster_location is a list of ints in {0, 1, 2}.
    """
    # Species name -> numeric cluster id; unknown names fall back to 2,
    # matching the original "else" branch.
    label_map = {"Iris-setosa": 0, "Iris-versicolor": 1}
    data = []
    cluster_location = []
    with open(str(file), 'r') as f:
        for line in f:
            # Skip blank lines (the UCI file ends with empty lines).
            if line == '\n':
                continue
            current = line.strip().split(",")
            # All but the last field are numeric features.
            data.append([float(v) for v in current[:-1]])
            # BUG FIX: line.strip() already removed the trailing newline,
            # so the original comparisons against "Iris-setosa\n" and
            # "Iris-versicolor\n" never matched and every sample was
            # assigned to cluster 2.
            cluster_location.append(label_map.get(current[-1], 2))
    print("加载数据完毕")
    return data, cluster_location
def randomise_data(data):
    """
    Shuffle the samples into a random order, keeping a record of that
    order so it can be undone later with de_randomise_data().

    Returns (shuffled_data, order) where shuffled_data[i] is
    data[order[i]].
    """
    order = list(range(len(data)))
    random.shuffle(order)
    shuffled = [data[pos] for pos in order]
    return shuffled, order
def de_randomise_data(data, order):
    """
    Invert randomise_data(): put each element back at its original
    position. *order* is the permutation list returned by
    randomise_data().
    """
    restored = [[] for _ in range(len(data))]
    for new_pos, original_pos in enumerate(order):
        restored[original_pos] = data[new_pos]
    return restored
def print_matrix(matrix):
    """
    Print a matrix one row per line so successive runs can be compared.

    NOTE: the parameter was renamed from `list` (which shadowed the
    builtin); all in-file callers pass it positionally.
    """
    for row in matrix:
        print(row)
def initialise_U(data, cluster_number):
    """
    Build a random membership matrix U of size n x k (n = number of
    data points, k = cluster_number) whose rows each sum to 1.

    Method: draw k random integers in [1, MAX] and divide each by the
    row total, yielding k fractions in (0, 1) that sum to exactly 1.
    Relies on the module-level constant MAX.
    """
    global MAX
    U = []
    for _ in range(len(data)):
        draws = [random.randint(1, int(MAX)) for _ in range(cluster_number)]
        total = float(sum(draws))
        U.append([d / total for d in draws])
    return U
def distance(point, center):
    """
    Euclidean distance between two points given as equal-length lists.

    Returns -1 when the dimensions differ (kept for compatibility with
    the original error convention).
    """
    if len(point) != len(center):
        return -1
    squared = sum(abs(p - c) ** 2 for p, c in zip(point, center))
    return math.sqrt(squared)
def end_conditon(U, U_old):
    """
    Convergence test: True when no entry of U moved by more than the
    module-level constant Epsilon since the previous iteration.
    """
    global Epsilon
    rows = len(U)
    cols = len(U[0])
    return not any(
        abs(U[i][j] - U_old[i][j]) > Epsilon
        for i in range(rows)
        for j in range(cols)
    )
def normalise_U(U):
    """
    Defuzzify U at the end of clustering: in each row the maximal
    membership becomes 1 and every other entry becomes 0.  (As in the
    original, tied maxima all become 1.)  U is modified in place and
    also returned.
    """
    for row in U:
        top = max(row)
        for j, value in enumerate(row):
            row[j] = 1 if value == top else 0
    return U
# The recommended range for the fuzzifier m is [1.5, 2.5].
def fuzzy(data, cluster_number, m):
    """
    Main FCM loop: alternately update the cluster centres C and the
    membership matrix U until U stops changing, then defuzzify U.

    Parameters: data (list of feature lists), cluster_number (number of
    clusters k), m (fuzzifier exponent).  Returns (U, C) where U is the
    normalised (one-hot) membership matrix and C the cluster centres.
    """
    # Random initial membership matrix (rows sum to 1).
    U = initialise_U(data, cluster_number)
    # print_matrix(U)
    n = len(data)
    dim = len(data[0])
    while True:
        # Keep a copy so the convergence test can compare iterations.
        U_old = copy.deepcopy(U)
        # --- centre update: weighted mean with weights U[k][j] ** m ---
        C = []
        for j in range(cluster_number):
            centre = []
            for i in range(dim):
                numerator = sum((U[k][j] ** m) * data[k][i] for k in range(n))
                denominator = sum(U[k][j] ** m for k in range(n))
                centre.append(numerator / denominator)
            C.append(centre)
        # --- distance of every point to every centre ---
        distance_matrix = [
            [distance(data[i], C[j]) for j in range(cluster_number)]
            for i in range(n)
        ]
        # --- membership update (standard FCM closed form) ---
        for j in range(cluster_number):
            for i in range(n):
                denom = sum(
                    (distance_matrix[i][j] / distance_matrix[i][k]) ** (2 / (m - 1))
                    for k in range(cluster_number)
                )
                U[i][j] = 1 / denom
        if end_conditon(U, U_old):
            print("结束聚类")
            break
    print("标准化 U")
    U = normalise_U(U)
    return U, C
def checker_iris(final_location):
    """
    Compare the de-randomised one-hot result against the known iris
    layout (150 samples, three species in consecutive blocks of 50) and
    report the accuracy as a percentage string.

    For each block of 50 the dominant predicted column is counted as
    correct, so the cluster-label permutation does not matter.
    """
    right = 0.0
    width = len(final_location[0])
    for block in range(3):
        counts = [0, 0, 0]
        for i in range(50):
            row = final_location[block * 50 + i]
            for j in range(width):
                if row[j] == 1:
                    counts[j] += 1
        right += max(counts)
    print(right)
    answer = right / 150 * 100
    return "准确度:" + str(answer) + "%"
if __name__ == '__main__':
    # Load the data set to cluster with pandas.
    dataset = pd.read_excel('./data/数据.xlsx', index_col=0, header=1)
    print(dataset.shape)
    # Shuffle the samples before clustering.
    data, order = randomise_data(dataset.values)
    # print_matrix(data)
    start = time.time()
    # Run fuzzy C-means: 5 clusters, fuzzifier m = 2.
    # Returns the one-hot membership matrix and the cluster centres.
    final_location, C = fuzzy(data, 5, 2)
    # print(final_location)
    # print(C)
    # Restore the original sample order.
    final_location = de_randomise_data(final_location, order)
    # Convert each one-hot row into a 1-based cluster label.
    # (Renamed from `type`, which shadowed the builtin.)
    cluster_labels = [nums.index(1) + 1 for nums in final_location]
    print(cluster_labels)
    dataset['类别'] = cluster_labels
    # Persist the per-sample cluster assignment.
    dataset.to_excel('./data/result.xlsx')
    # Persist the cluster centres, one row per cluster.
    df = pd.DataFrame(C)
    # Drop the freshly appended '类别' column to label the feature columns.
    df.columns = dataset.columns[:-1]
    print('聚类中心:\n', df)
    df.to_excel('./data/center.xlsx')
    print("用时:{0}".format(time.time() - start))
2.analysis.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/5/20 23:07
# @Author : ystraw
# @Site :
# @File : analysis.py
# @Software: PyCharm Community Edition
# @function: 绘图
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Use the SimHei font so Chinese labels (and the minus sign) render correctly.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.serif'] = ['SimHei']
# Radar chart section.
# (The triple-quoted string below is a no-op literal left by the author;
#  it describes draw1: "customer-profile analysis radar chart".)
'''
描绘客户特征分析雷达图
'''
def draw1():
    """
    Radar chart of the cluster centres stored by main.py.

    Reads ./data/center.xlsx (one row per cluster, one column per
    feature), draws one closed polygon per cluster centre and saves the
    figure under ./image/.
    """
    # Cluster centres written by main.py.
    data = pd.read_excel('./data/center.xlsx', index_col=0)
    print(data.head())
    # Render Chinese labels and the minus sign correctly.
    plt.rcParams['font.sans-serif'] = 'SimHei'
    plt.rcParams['axes.unicode_minus'] = False
    # Feature names become the spoke labels.
    labels = data.columns
    num_features = len(labels)
    data_radar = data.values
    # One spoke per feature, evenly spaced around the circle.
    angles = np.linspace(0, 2 * np.pi, num_features, endpoint=False)
    # Append the first angle again so each polygon closes.
    angles = np.concatenate((angles, [angles[0]]))
    # Line style per cluster (repeated so up to 10 clusters are covered).
    style = ['r-', 'o--', 'g-.', 'b:', 'p-'] * 2
    for i in range(len(data_radar)):
        # Close the polygon by repeating the first value.
        row = np.concatenate((data_radar[i], [data_radar[i][0]]))
        plt.polar(angles, row, style[i], linewidth=2)
    # BUG FIX: pass one angle per label (drop the duplicated closing
    # angle); newer matplotlib raises when angles and labels differ in
    # length.
    plt.thetagrids(angles[:-1] * 180 / np.pi, labels)
    # plt.ylim(0, 70)
    # BUG FIX: the legend needs one entry per cluster (row of
    # data_radar), not one per feature column as before.
    kflable = ['类别' + str(i + 1) for i in range(len(data_radar))]
    plt.legend(kflable, bbox_to_anchor=(1.25, 1.15))
    plt.title(u'聚类中心分析雷达图')
    plt.savefig('./image/聚类中心分析雷达图.png')
    plt.show()
def draw2():
    """
    Donut chart of how many samples fell into each cluster.

    Reads ./data/result.xlsx (written by main.py) and saves the figure
    under ./image/.
    """
    data = pd.read_excel('./data/result.xlsx', index_col=0)
    print(data.head())
    # 调节图形大小,宽,高
    # plt.figure(figsize=(8,5))
    # Per-sample cluster labels.  (Renamed from `type`, which shadowed
    # the builtin and was later clobbered again by the loop below.)
    cluster_labels = data['类别']
    counts = cluster_labels.value_counts()
    # Wedge sizes and their cluster ids.
    x = counts.values
    label = counts.index.values
    print('结果统计:\n', counts)
    # One colour per wedge.
    color = ['aqua', 'linen', 'lightcoral', 'olive', 'gold']
    # p_text: percentage texts inside the wedges; l_text: outer label texts.
    patches, l_text, p_text = plt.pie(x, autopct='%3.1f%%', radius=0.5, pctdistance=0.85, colors=color, wedgeprops=dict(linewidth=2, width=0.3, edgecolor='w'))
    # Legend entries follow the same order as the wedges.
    legend_text = ['类别' + str(label[i]) for i in range(len(label))]
    legend = plt.legend(legend_text, title='类别', loc='center right', bbox_to_anchor=(1.12, 0.8), fontsize=13)
    legend.get_title().set_fontsize(12)
    # Enlarge each percentage text and append its cluster name below it.
    for t, name in zip(p_text, legend_text):
        t.set_size(13)
        t.set_text(t.get_text() + '\n' + name)
    # plt.title('不同群占比图', fontsize=27)
    # Equal axis scaling so the pie is drawn as a circle.
    plt.axis('equal')
    plt.savefig('image/类别统计饼图.png')
    plt.show()
if __name__ == '__main__':
    # Plot the cluster centres (radar chart):
    draw1()
    # Plot the distribution of cluster assignments (pie chart):
    draw2()