K-Means对红酒数据进行聚类(Python)

这也是人工智能课要求的一个小实验~

K-Means聚类原理大家应该很熟悉了就不介绍了,红酒数据大家可以自行下载。

代码如下:

# -*- coding: utf-8 -*-
"""
K-Means clustering of the UCI wine dataset (hand-rolled K-Means,
with sklearn's KMeans used only to pick k via the elbow method).

Created on Tue Jun 22 16:22:15 2021

@author: Overcoming
"""
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']  # allow Chinese characters in plot labels
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly on axes
from sklearn.decomposition import PCA
from scipy.spatial.distance import pdist
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans


path = 'wine.data'  # wine dataset path; NOTE: a path containing non-ASCII characters caused read errors for the author
df = pd.read_csv(path,header=None) # header=None: the file has no header row; without it the first data row would be consumed as column names
data = df.values  # raw array: column 0 = true class label (1..3), columns 1.. = features (reduced via PCA below)
comp = 2 # target dimensionality for PCA (2-D so clusters can be plotted)


number = len(data)  # number of samples

def getPCAData(data, comp):
    """Split off the true labels and project the features onto `comp` principal components.

    Args:
        data: raw array where column 0 is the true class label and the
            remaining columns are features.
        comp: number of principal components to keep.

    Returns:
        (data_PCA, tru_cla): the whitened PCA projection of the feature
        columns, and the true class labels from column 0.
    """
    features = data[:, 1:]
    pca_model = PCA(n_components=comp, whiten=True)  # whiten: normalize each component to unit variance
    pca_model.fit(features)
    projected = pca_model.transform(features)
    true_labels = data[:, 0]
    return projected, true_labels

def K_val():
    """Elbow-method helper: plot mean distortion for k = 1..9 on the global data_PCA.

    For each candidate k, fits sklearn KMeans and records the average
    distance from each point to its nearest cluster center; the "elbow"
    of the resulting curve suggests a good k.
    """
    candidate_ks = range(1, 10)
    distortions = []
    for n_clusters in candidate_ks:
        model = KMeans(n_clusters=n_clusters).fit(data_PCA)
        nearest = np.min(cdist(data_PCA, model.cluster_centers_, 'euclidean'), axis=1)
        distortions.append(sum(nearest) / data_PCA.shape[0])
    # Plot the elbow curve
    plt.plot(candidate_ks, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('误差平方和')  # SSE
    plt.title('肘方法显示最优k值')  # 'The Elbow Method showing the optimal k'
    plt.show()



def ori(number, k, data_PCA):
    """Pick k DISTINCT random data points as the initial cluster centers.

    The original drew indices with random.randint (with replacement), so two
    centers could start as the identical point; that later produces an empty
    cluster and np.mean([]) -> NaN in gather(). Sampling without replacement
    guarantees distinct initial centers.

    Args:
        number: number of data points (rows of data_PCA to sample from).
        k: number of clusters.
        data_PCA: 2-D array of points.

    Returns:
        List of k rows of data_PCA, each a starting center.
    """
    indices = random.sample(range(number), k)  # k distinct indices
    return [data_PCA[i] for i in indices]



# NOTE: the author benchmarked this pdist formulation as faster than
# np.linalg.norm(np.array(a) - np.array(b)) for this workload.
def dist(a, b):
    """Euclidean distance between points a and b.

    Returns a 1-element array (the condensed distance matrix of the two
    stacked points, as produced by scipy's pdist).
    """
    stacked = np.vstack([a, b])
    return pdist(stacked)

def gather(center, data):
    """One K-Means step: assign every point to its nearest center, then recompute centers.

    Fixes over the original:
    - uses the `data` parameter instead of the global data_PCA / number
      (the original silently ignored its argument);
    - derives k from len(center) instead of the global k;
    - an empty cluster keeps its previous center instead of becoming
      np.mean([], axis=0) -> NaN.

    Args:
        center: list of current center points; updated IN PLACE (callers
            rely on this mutation) and also returned.
        data: 2-D array of points to cluster.

    Returns:
        (ga, center): ga[j] is the list of points assigned to cluster j,
        and center is the (mutated) list of updated centers.
    """
    n_clusters = len(center)
    ga = [[] for _ in range(n_clusters)]
    for point in data:
        distances = [np.linalg.norm(point - c) for c in center]
        ga[int(np.argmin(distances))].append(point)  # nearest center wins; ties go to the lowest index
    for j in range(n_clusters):
        if ga[j]:  # guard: leave an empty cluster's center untouched
            center[j] = np.mean(ga[j], axis=0)  # column-wise mean = new centroid
    return ga, center


def Rate(k, ga, data, tru_cla):
    """Compute and print the clustering accuracy for the 3-class wine data.

    The original body used `for data[i] in ga[0]:`, which rebinds data[i]
    as the loop target — it counted every member of every cluster for every
    index i (and overwrote data), so the reported rate was wrong. This
    version counts, for each true class, how many of its points landed in
    each cluster, and credits the largest such count as "correct".

    Args:
        k: number of clusters; only k == 3 is supported (3 true classes).
        ga: list of k clusters, each a list of points (rows of `data`).
        data: the clustered points, indexed in step with tru_cla.
        tru_cla: true class labels, values 1..3.

    Returns:
        The accuracy in [0, 1], or None when k != 3.
    """
    if k != 3:
        print('不能计算聚类正确率!')
        return None
    # counts[true_class][cluster] = number of points of that class in that cluster
    counts = [[0] * k for _ in range(k)]
    for i in range(len(data)):
        true_idx = int(tru_cla[i]) - 1  # labels are 1-based
        for c in range(k):
            # membership test by exact array comparison against the cluster's points
            if any(np.array_equal(data[i], p) for p in ga[c]):
                counts[true_idx][c] += 1
                break
    # For each true class, its best-matching cluster's count is taken as "correctly grouped".
    right = [max(row) for row in counts]
    rate = sum(right) / len(data)
    print('聚类正确率:', rate)
    return rate
                    
    
    
def paint():
    """Scatter-plot each cluster in its own color plus the cluster centers.

    Reads the globals ga (clusters), center (centroids) and k; shows the
    figure via plt.show().
    """
    # One scatter call per cluster so matplotlib cycles the colors.
    for cluster in ga:
        xs = [point[0] for point in cluster]
        ys = [point[1] for point in cluster]
        plt.scatter(xs, ys)
    # Mark every centroid with a star and a text label.
    cx = [center[j][0] for j in range(k)]
    cy = [center[j][1] for j in range(k)]
    for j in range(k):
        plt.text(cx[j], cy[j], 'CENTER')
    plt.scatter(cx, cy, marker='*', s=200, alpha=1)
    plt.show()
    

    
    
if __name__ == '__main__':
    # Project the wine features to 2-D and pull out the true class labels.
    data_PCA,tru_cla = getPCAData(data,comp)
    # Show the elbow plot so the user can choose k, then read it from stdin.
    K_val()
    k = int(input('聚类k值:') ) # number of clusters
    # Random initial centers, then two warm-up K-Means steps to seed the
    # convergence test below. NOTE: gather mutates its `center` argument in
    # place, so center/center1 alias the same list; A and B snapshot the
    # coordinates (via tolist) immediately after each step.
    center = ori(number,k,data_PCA)
    ga,center1 = gather(center,data_PCA)
    A = []    # center coordinates from the previous iteration
    for i in range(k):
            A.append(center1[i].tolist())
    ga,center= gather(center1,data_PCA)
    B = []    # center coordinates from the current iteration
    for i in range(k):
        B.append(center[i].tolist())
    jj = 0
    while A != B: # iterate until the centers stop moving (exact coordinate equality)
        A = []
        B = []
        jj =jj+1
        print('迭代%d次'%jj)
        ga,center1 = gather(center,data_PCA)
        for i in range(k):
            A.append(center1[i].tolist())
        ga,center = gather(center1,data_PCA)
        for i in range(k):
            B.append(center[i].tolist())
    # Plot the final clusters and report accuracy against the true labels.
    paint()
    Rate(k,ga,data_PCA,tru_cla)
    
    
    

求大佬指教

  • 5
    点赞
  • 44
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值