这也是人工智能课要求的一个小实验~
K-Means聚类的原理大家应该很熟悉了,这里就不再介绍;红酒(wine)数据集大家可以自行下载。
代码如下:
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 22 16:22:15 2021
@author: Overcoming
"""
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']#解决不能显示中文问题
plt.rcParams['axes.unicode_minus']=False #解决负数坐标显示问题
from sklearn.decomposition import PCA
from scipy.spatial.distance import pdist
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
# Path to the wine data file.
# NOTE: non-ASCII (e.g. Chinese) characters in the path can make pandas fail to open it.
path = 'wine.data'
#df = pd.read_csv(path,header=None,index_col=0,names=['feature0','feature1','feature2','feature3','feature4','feature5','feature6','feature7','feature8','feature9','feature10','feature11''feature12','feature13'])
# header=None: the file has no header row, so row 0 must not be treated as column names.
df = pd.read_csv(path,header=None)
data = df.values  # (n_samples, 14) array: column 0 is the true class label (1/2/3), columns 1..13 are features
comp = 2  # target dimensionality for the PCA reduction below
number = len(data)  # total number of samples
#number_featuer = len(data[0])
def getPCAData(data, comp):
    """Project the wine samples onto `comp` principal components.

    data : (n_samples, 14) array whose first column is the true class label.
    comp : number of PCA components to keep.
    Returns (projected, labels): the reduced feature matrix and the
    label column taken verbatim from `data`.
    """
    features = data[:, 1:]          # drop the label column before reducing
    labels = data[:, 0]             # keep the true classes for later scoring
    # whiten=True rescales each component to unit variance
    reducer = PCA(n_components=comp, whiten=True)
    reducer.fit(features)
    projected = reducer.transform(features)
    return projected, labels
def K_val(points=None):
    """Plot the SSE "elbow" curve for k = 1..9 to help choose the cluster count.

    points : optional (n_samples, n_dims) array to cluster. Defaults to the
             module-level ``data_PCA`` so existing ``K_val()`` calls keep
             working (the original read that global implicitly).
    Shows a matplotlib window; returns nothing.
    """
    if points is None:
        points = data_PCA  # backward-compatible fallback to the script's global
    distortions = []
    K = range(1, 10)
    for k in K:
        kmeanModel = KMeans(n_clusters=k).fit(points)
        # Mean distance from each sample to its nearest center — an SSE proxy.
        distortions.append(
            sum(np.min(cdist(points, kmeanModel.cluster_centers_, 'euclidean'), axis=1))
            / points.shape[0]
        )
    # Plot the elbow
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('误差平方和')  # SSE
    plt.title('肘方法显示最优k值')  # 'The Elbow Method showing the optimal k'
    plt.show()
def ori(number, k, data_PCA):
    """Pick k distinct random samples as the initial cluster centers.

    number   : total number of samples (rows) in data_PCA.
    k        : number of clusters requested.
    data_PCA : (number, n_dims) array of points.
    Returns a list of k rows of data_PCA.

    Fix: the original drew indices with random.randint, which can repeat;
    two identical centers leave one cluster empty and its mean becomes NaN.
    random.sample guarantees k distinct indices (requires k <= number).
    """
    indices = random.sample(range(number), k)
    return [data_PCA[i] for i in indices]
'''def dist(a,b):
#算距离
d =np.linalg.norm(np.array(a)-np.array(b))
return d'''
#经测试上面计算距离算法慢于下列算法
def dist(a, b):
    """Euclidean distance between two points, returned as a 1-element array.

    Stacks the pair and hands it to scipy's pdist (the author benchmarked
    this as faster than np.linalg.norm on the difference).
    """
    return pdist(np.vstack([a, b]))
def gather(center, data):
    """One k-means step: assign each sample to its nearest center, then
    move every center to the mean of its cluster.

    center : list of 1-D arrays; mutated in place and also returned.
    data   : (n_samples, n_dims) array of points to cluster.
    Returns (ga, center) where ga[j] is the list of points assigned to
    cluster j.

    Fixes vs. the original: it clustered the globals ``data_PCA``/``number``
    instead of the ``data`` argument, and np.mean([]) on an empty cluster
    produced NaN centers — empty clusters now keep their previous center.
    """
    n_clusters = len(center)
    ga = [[] for _ in range(n_clusters)]
    # One vectorized distance matrix instead of n*k per-pair dist() calls.
    d = cdist(np.asarray(data), np.asarray(center), 'euclidean')
    # argmin picks the first minimum, matching the original d.index(min(d)) tie-break.
    nearest = np.argmin(d, axis=1)
    for i, c in enumerate(nearest):
        ga[c].append(data[i])
    for j in range(n_clusters):
        if ga[j]:  # guard: an empty cluster keeps its old center instead of turning NaN
            center[j] = np.mean(ga[j], axis=0)
    return ga, center
def Rate(k, ga, data, tru_cla):
    """Estimate clustering accuracy against the true wine labels.

    For each true class, counts how many of its points landed in each
    cluster and credits the best-matching cluster; the accuracy is the
    sum of those maxima over the total number of points.

    k       : number of clusters (only k == 3 matches the 3 wine classes).
    ga      : list of k clusters, each a list of point arrays (rows of data).
    data    : (n_samples, n_dims) array the clusters were built from.
    tru_cla : true class labels (values 1, 2, 3), one per row of data.
    Returns the accuracy in [0, 1], or None when k != 3.

    Fix vs. the original: ``for data[i] in ga[0]:`` used ``data[i]`` as the
    loop TARGET, clobbering rows of ``data`` and counting
    |class| * |cluster| instead of their intersection (the "rate" could
    exceed 1). Membership is now tested with element-wise array equality.
    """
    if k != 3:
        print('不能计算聚类正确率!')
        return None
    # index[j] = row indices of the points whose true class is j+1
    index = [[] for _ in range(k)]
    for i in range(len(data)):
        cls = int(tru_cla[i])
        if 1 <= cls <= k:
            index[cls - 1].append(i)
    right = []  # per true class: size of its best-matching cluster overlap
    for j in range(k):
        counts = []
        for c in range(k):
            # How many points of true class j ended up in cluster c.
            # (Assumes rows of `data` are distinct, as in the wine data.)
            hits = sum(
                1
                for i in index[j]
                if any(np.array_equal(data[i], p) for p in ga[c])
            )
            counts.append(hits)
        right.append(max(counts))
    rate = sum(right) / len(data)
    print('聚类正确率:', rate)
    return rate
def paint():
    """Scatter-plot each cluster in its own color and star the centers.

    Reads the module-level globals ``ga`` (list of clusters), ``center``
    (list of center coordinates) and ``k`` set up by the main script.
    Shows a matplotlib window; returns nothing.
    """
    # One scatter call per cluster so matplotlib cycles the colors.
    for cluster in ga:
        xs = [p[0] for p in cluster]
        ys = [p[1] for p in cluster]
        plt.scatter(xs, ys)
    cx = []  # center x coordinates
    cy = []  # center y coordinates
    for j in range(k):
        cx.append(center[j][0])
        cy.append(center[j][1])
        plt.text(center[j][0], center[j][1], 'CENTER')
    # Draw all centers on top as large stars.
    plt.scatter(cx, cy, marker='*', s=200, alpha=1)
    plt.show()
if __name__ == '__main__':
    # Reduce the 13 wine features to `comp` PCA dimensions; tru_cla keeps the true labels.
    data_PCA,tru_cla = getPCAData(data,comp)
    # Show the elbow plot so the user can pick k before being prompted.
    K_val()
    k = int(input('聚类k值:') )  # number of clusters, chosen by the user
    center = ori(number,k,data_PCA)
    # NOTE(review): gather() updates `center` in place AND returns the same list,
    # so `center` and `center1` alias one object; A/B hold independent snapshots
    # because .tolist() copies the coordinates.
    ga,center1 = gather(center,data_PCA)
    A = []  # center coordinates from the previous iteration (plain lists)
    for i in range(k):
        A.append(center1[i].tolist())
    #print(a)
    #a = center1[0].tolist()  # first center of the previous iteration
    #print(center1)
    #print(a)
    ga,center= gather(center1,data_PCA)
    B = []  # center coordinates from the current iteration
    for i in range(k):
        B.append(center[i].tolist())
    #b = center[0].tolist()  # current first cluster center
    # print(center)
    # print(b)
    #print(a != b)
    jj = 0  # iteration counter (for the progress printout only)
    # Convergence test: stop when two consecutive center snapshots are exactly
    # equal. Each loop body runs gather() twice, so one printed "iteration"
    # corresponds to two k-means update steps.
    while A != B:
        A = []
        B = []
        jj =jj+1
        print('迭代%d次'%jj)
        ga,center1 = gather(center,data_PCA)
        for i in range(k):
            A.append(center1[i].tolist())
        #print(a)
        ga,center = gather(center1,data_PCA)
        for i in range(k):
            B.append(center[i].tolist())
        #print(b)
    # Plot the final clusters, then score them against the true labels.
    paint()
    Rate(k,ga,data_PCA,tru_cla)
求大佬指教