kmeans算法

Python:

import numpy as np
import matplotlib.pylab as plt
from math import sqrt
from sklearn import datasets
class Kmeans():
    """A simple K-means implementation written in plain Python/NumPy.

    The samples are expected as a 2-D NumPy array with one sample per row;
    the methods index it as ``data[i, :]`` and ``data[mask, :]``.
    """

    def __init__(self, data, k):
        """Store the samples and the requested number of clusters.

        Args:
            data: 2-D array of samples, one row per sample.
            k: number of clusters to build.
        """
        self.data = data
        self.k = k

    def distEclud(self, v1, v2):
        """Return the Euclidean distance between two vectors as a float."""
        diff = np.asarray(v1, dtype=float) - np.asarray(v2, dtype=float)
        # NumPy reductions instead of builtin sum()/math.sqrt(): they work on
        # the whole array at once and do not depend on the array being 1-D.
        return float(np.sqrt(np.sum(diff ** 2)))

    def randCent(self):
        """Pick ``k`` rows of the data as the initial centroids.

        Rows are drawn without replacement so that two centroids never start
        from the same sample (which would leave one cluster empty).  Sampling
        with replacement is used only when ``k`` exceeds the number of rows.

        Returns:
            A ``(k, n_features)`` float array of initial centroids.
        """
        m, n = np.shape(self.data)
        picks = np.random.choice(m, size=self.k, replace=self.k > m)
        centroids = np.zeros((self.k, n))
        centroids[:, :] = self.data[picks, :]
        return centroids

    def km(self):
        """Run K-means until no sample changes cluster.

        Returns:
            A tuple ``(centroids, clusterAssment)`` where ``centroids`` is a
            ``(k, n_features)`` array and ``clusterAssment`` is an ``(m, 2)``
            array holding, per sample, the cluster index (column 0) and the
            squared distance to that cluster's centroid (column 1).
        """
        m = np.shape(self.data)[0]  # number of samples
        clusterAssment = np.zeros((m, 2))
        # Start from "no cluster" (-1).  Starting from 0 would make samples
        # whose nearest centroid is cluster 0 look already assigned.
        clusterAssment[:, 0] = -1
        centroids = self.randCent()
        clusterChange = True  # True while another pass is still required
        while clusterChange:
            clusterChange = False
            for i in range(m):
                minDist = np.inf
                minIndex = -1
                for j in range(self.k):
                    distance = self.distEclud(centroids[j, :], self.data[i, :])
                    if distance < minDist:
                        minDist = distance
                        minIndex = j
                if clusterAssment[i, 0] != minIndex:
                    clusterChange = True
                # Always refresh the stored distance: centroids move between
                # passes even when the sample stays in the same cluster.
                clusterAssment[i, :] = minIndex, minDist ** 2
            for j in range(self.k):
                # New centroid = column-wise mean of the cluster's samples.
                pointsInCluster = self.data[clusterAssment[:, 0] == j, :]
                # An empty cluster keeps its previous centroid; averaging an
                # empty selection would turn the centroid into NaN.
                if len(pointsInCluster) > 0:
                    centroids[j, :] = np.mean(pointsInCluster, axis=0)
        return centroids, clusterAssment

    def ptlshow(self, k):
        """Plot the first two columns before and after clustering.

        Shows the raw scatter plot, runs :meth:`km`, plots the samples with a
        marker per cluster and the centroids on top, then prints the
        assignment table, the sample count, the total squared error and the
        mean squared error.

        Args:
            k: number of centroids to draw; values above ``self.k`` are
                clamped because only ``self.k`` centroids exist.
        """
        m = np.shape(self.data)[0]
        plt.scatter(self.data[:, 0], self.data[:, 1], c="r")
        plt.show()  # before clustering
        centroids, clusterAssment = self.km()
        mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
        for i in range(m):
            markIndex = int(clusterAssment[i, 0])
            # Wrap around so more than len(mark) clusters cannot overflow.
            plt.plot(self.data[i, 0], self.data[i, 1],
                     mark[markIndex % len(mark)])

        mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
        for i in range(min(k, self.k)):
            plt.plot(centroids[i, 0], centroids[i, 1], mark[i % len(mark)])
        plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='both', alpha=0.4)
        plt.show()

        sse = np.sum(clusterAssment[:, 1])
        print(clusterAssment, m, sse, sse / m)



# Driver for the hand-written K-means: cluster the iris samples into three
# groups using sepal length (column 0) and petal length (column 2).
data=datasets.load_iris()
#data_1=data["data"][:,:2]#sepal length sepal width
#data_2=data["data"][:,2:]#petal length petal width
data_3=data["data"][:,[0,2]]
k=Kmeans(data_3,3)
# ptlshow() runs km() itself, so a separate k.km() call beforehand would only
# repeat the clustering and discard the result.
k.ptlshow(3)
"""
k=Kmeans(data_1,3)
k.km()
k.ptlshow(3)

k=Kmeans(data_2,3)
k.km()
k.ptlshow(3)

print("特征名字:",data["feature_names"])
print("特征值:",data["target"])
print("目标值的名字:",data["target_names"])
"""

Sklearn:

import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
import sklearn.metrics as sm
import pandas as pd
import numpy as np

class Kmeans():
    """Cluster the iris data with scikit-learn's KMeans and plot the result.

    ``data`` is indexed with the keys ``"data"``, ``"feature_names"``,
    ``"target"`` and ``"target_names"`` (the layout returned by
    ``sklearn.datasets.load_iris()``).
    """

    def __init__(self, data):
        """Print the raw dataset and convert it to pandas structures.

        Args:
            data: mapping with ``"data"`` (four feature columns),
                ``"feature_names"``, ``"target"`` and ``"target_names"``.
        """
        self.data = data
        self.print_data("数据:", self.data["data"])
        self.print_data("特征名字:", self.data["feature_names"])
        self.print_data("目标值:", self.data["target"])
        self.print_data("目标值得名字:", self.data["target_names"])
        # Convert the feature matrix to a DataFrame with named columns:
        # Sepal_Length, Sepal_Width, Petal_Length, Petal_Width.
        self.data = pd.DataFrame(self.data["data"])
        self.data.columns = ['Sepal_Length', 'Sepal_Width', 'Petal_Length',
                             'Petal_Width']
        # Convert the target vector to a single-column DataFrame.
        self.y = pd.DataFrame(data["target"])
        self.y.columns = ['Targets']
        # Colour palette, indexed by target class.
        self.colormap = np.array(["red", "lime", "black"])

    def print_data(self, str, data):
        """Print a label and a value followed by a separator line.

        The first parameter shadows the builtin ``str``; the name is kept so
        existing keyword callers keep working.
        """
        print(str, data)
        print("=" * 85)

    def show_data(self, x, y, title):
        """Scatter-plot ``x`` against ``y`` coloured by the true class."""
        plt.figure(figsize=(14, 7))  # create the canvas
        plt.scatter(x, y, c=self.colormap[self.y.Targets], s=40)
        plt.title(title)
        plt.show()

    @staticmethod
    def _align_labels(true_labels, cluster_labels):
        """Map every cluster id to the most frequent true label inside it.

        Cluster ids produced by a clustering algorithm are arbitrary, so they
        must be translated to class ids before they can be compared with the
        ground truth.

        Returns:
            A 1-D array with one translated label per sample.
        """
        true_labels = np.asarray(true_labels).ravel()
        cluster_labels = np.asarray(cluster_labels).ravel()
        aligned = np.empty_like(true_labels)
        for cluster_id in np.unique(cluster_labels):
            members = cluster_labels == cluster_id
            values, counts = np.unique(true_labels[members], return_counts=True)
            aligned[members] = values[np.argmax(counts)]
        return aligned

    def km(self):
        """Plot the raw data, cluster on sepal/petal length, plot and score."""
        # Sepal length versus sepal width.
        self.show_data(self.data.Sepal_Length, self.data.Sepal_Width, title='Sepal')
        # Petal length versus petal width.
        self.show_data(self.data.Petal_Length, self.data.Petal_Width, title='Petal')
        # Sepal length versus petal length, before clustering.
        self.show_data(self.data.Sepal_Length, self.data.Petal_Length, title='Length')
        # Columns to cluster on.  DataFrame.ix no longer exists in current
        # pandas; .loc is the label-based replacement.
        X = self.data.loc[:, ['Sepal_Length', 'Petal_Length']]
        estimator = KMeans(n_clusters=3)
        estimator.fit(X)
        label_pred = estimator.labels_
        x0 = self.data[label_pred == 0]  # cluster 0
        x1 = self.data[label_pred == 1]  # cluster 1
        x2 = self.data[label_pred == 2]  # cluster 2
        plt.rc("font", family="STXihei", size=10)
        plt.scatter(x0['Sepal_Length'], x0['Petal_Length'], 50, color='red', marker='+', linewidth=2, alpha=0.8)
        plt.scatter(x1['Sepal_Length'], x1['Petal_Length'], 50, color='yellow', marker='+', linewidth=2, alpha=0.8)
        plt.scatter(x2['Sepal_Length'], x2['Petal_Length'], 50, color='blue', marker='+', linewidth=2, alpha=0.8)
        plt.xlabel('Sepal_Length')
        # The y axis shows petal length, not sepal width.
        plt.ylabel('Petal_Length')
        plt.xlim(4, 10)
        plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='both', alpha=0.4)
        plt.show()
        # Compute and print the accuracy.  Cluster ids are translated to
        # class ids first; comparing raw ids would score the numbering the
        # estimator happened to pick rather than the clustering itself.
        aligned = self._align_labels(self.y.Targets, label_pred)
        print('the accuracy is:', sm.accuracy_score(self.y, aligned))

# Driver for the scikit-learn version: load iris, build the wrapper (its
# constructor prints the dataset), then plot, cluster and print the accuracy.
data=datasets.load_iris()#load the data
km=Kmeans(data)
km.km()
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值