20191202_k-中心聚类算法和k-means算法Python实现

最新推荐文章于 2024-07-06 20:26:50 发布

Happy丶lazy

最新推荐文章于 2024-07-06 20:26:50 发布

阅读量1.3k

点赞数

分类专栏：接单文章标签： kmeans算法 k中心聚类算法数据挖掘

本文链接：https://blog.csdn.net/qq_39309652/article/details/103744828

版权

接单专栏收录该内容

35 篇文章 4 订阅

订阅专栏

这是最后一个GUI，主要是k-中心聚类算法的实现，之后要总结一下经典学习算法了
在这里插入图片描述

import pandas as pd
from pylab import mpl
import numpy as np
mpl.rcParams['font.sans-serif'] = ['FangSong'] # set the default font
mpl.rcParams['axes.unicode_minus'] = False # keep the minus sign '-' from rendering as a box in saved figures
import matplotlib.pyplot as plt
%matplotlib inline

# Load the sewage test-result workbook into a DataFrame.
test=pd.read_excel('data/2017年南区污水试验检测结果统计表(周检).xlsx')

# Preview the first rows.
test.head()

	id	取样地点	取样日期	编号	化学需氧量	氨氮	总氮	总磷	锰
0	1	南区污水	2017-05-22	170522WST01	1.21	0.025	0.05	0.030	0.01
1	2	南区污水	2017-05-22	170522WST02	360.00	0.091	50.50	0.910	0.01
2	3	南区污水	2017-06-01	170601WST03	20.70	0.025	2.03	0.006	0.01
3	4	南区污水	2017-06-01	170601WST04	39.60	0.605	27.60	2.430	0.01
4	5	南区科研	2017-06-05	170605WST04	45.80	0.312	21.70	0.983	0.01

# Print the column dtypes and non-null counts.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 9 columns):
id       18 non-null int64
取样地点     18 non-null object
取样日期     18 non-null datetime64[ns]
编号       18 non-null object
化学需氧量    18 non-null float64
氨氮       18 non-null float64
总氮       18 non-null float64
总磷       18 non-null float64
锰        18 non-null float64
dtypes: datetime64[ns](1), float64(5), int64(1), object(2)
memory usage: 1.3+ KB

# Draw the DataFrame's histograms on a 20x10 figure.
test.hist(figsize=(20,10))

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x00000151DEEE1588>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000151DEF36828>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x00000151E0FF1BA8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000151E1022278>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x00000151E1046908>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x00000151E1046940>]],
      dtype=object)

在这里插入图片描述
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-x29sFPzn-1577516969567)(output_4_1.png)]

# List the distinct values of the sampling-location column.
test['取样地点'].unique()

array(['南区污水', '南区科研', '反渗透水', '超滤出水', '超滤进水', '超滤产水'], dtype=object)

# Drop the location, code and date columns; the remaining columns are the clustering features.
# NOTE(review): the `id` column is not dropped, so it takes part in every distance computation.
X = test.drop(['取样地点','编号','取样日期'],axis=1)

通过聚类算法验证是否可行

def randomCenter(data, k):
    """Randomly initialise k cluster centres inside the bounding box of data.

    Every coordinate of every centre is drawn uniformly between the minimum
    and the maximum of the corresponding feature column.

    :param data: training samples, 2-D array-like of shape (m, n)
    :param k: number of cluster centres to generate
    :return: k x n ``np.matrix`` holding the initial centres
    """
    features = np.asarray(data, dtype=float)
    n = features.shape[1]  # number of features

    # np.mat was removed in NumPy 2.0; np.asmatrix yields the same matrix type.
    cent = np.asmatrix(np.zeros((k, n)))

    for j in range(n):
        minJ = np.min(features[:, j])
        rangeJ = np.max(features[:, j]) - minJ
        # One (k, 1) uniform draw per feature, in column order, so results for
        # a given random seed stay the same as before.
        cent[:, j] = minJ + np.random.rand(k, 1) * rangeJ

    return cent

def kmeans(data, k, cent):
    """Cluster data into k groups with the k-means algorithm.

    Samples are repeatedly assigned to their nearest centre (squared
    Euclidean distance, ties going to the lowest index) and each centre is
    moved to the mean of its members, until no assignment changes.

    A centre that ends up with no members keeps its previous position
    instead of becoming NaN, and the reported distances always refer to the
    final centres.

    :param data: training samples, 2-D array-like of shape (m, n)
    :param k: number of cluster centres (only the first k rows of cent are used)
    :param cent: initial centres with at least k rows and n columns; the
        argument itself is left unmodified
    :return: tuple ``(subCenter, cent)``; ``subCenter`` is an m x 2
        ``np.matrix`` whose columns are the cluster index and the squared
        distance to that centre, ``cent`` is the ``np.matrix`` of trained
        centres
    :raises ValueError: if k is smaller than 1
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    samples = np.asarray(data, dtype=float)
    m, n = samples.shape  # m: number of samples; n: number of features
    centres = np.array(cent, dtype=float)  # private copy, never the caller's object

    labels = np.full(m, -1)  # -1 = not assigned yet, so the first pass always counts as a change
    sq_dist = np.zeros(m)
    rows = np.arange(m)

    while True:
        # (m, k) table of squared distances between every sample and centre
        diffs = samples[:, np.newaxis, :] - centres[np.newaxis, :k, :]
        all_sq = np.sum(diffs * diffs, axis=2)
        nearest = np.argmin(all_sq, axis=1)
        sq_dist = all_sq[rows, nearest]

        if np.array_equal(nearest, labels):
            break  # assignments are stable: converged
        labels = nearest

        # move every non-empty cluster's centre to the mean of its members
        for j in range(k):
            members = samples[labels == j]
            if len(members) > 0:
                centres[j] = members.mean(axis=0)

    subCenter = np.asmatrix(np.column_stack((labels.astype(float), sq_dist)))
    return subCenter, np.asmatrix(centres)

def save_result(file_name, data):
    """Write a 2-D array to a text file, one tab-separated row per line.

    :param file_name: path of the output file; it is opened in "w" mode, so
        existing content is replaced
    :param data: 2-D array or matrix that supports ``data[i, j]`` indexing
    :return: None
    """
    m, n = np.shape(data)

    # The context manager closes the file even if a write raises.
    with open(file_name, "w") as f:
        for i in range(m):
            f.write("\t".join(str(data[i, j]) for j in range(n)) + "\n")

def distance(vecA, vecB):
    """Return the squared Euclidean distance between two vectors.

    Both arguments are flattened first, so 1-D arrays, 1 x n arrays and
    1 x n ``np.matrix`` rows are all accepted and may be mixed. (The former
    matrix-product formulation only worked when at least one operand was an
    ``np.matrix``; for plain 2-D arrays it broadcast to an n x n product.)

    :param vecA: coordinates of vector A
    :param vecB: coordinates of vector B
    :return: squared Euclidean distance as a NumPy scalar
    """
    diff = np.asarray(vecA).ravel() - np.asarray(vecB).ravel()
    return np.dot(diff, diff)

if __name__ == "__main__":

    k = 4# number of cluster centres

    # Despite its name, file_path holds the feature table converted to an ndarray.
    file_path = np.array(X)

    # Run k-means starting from randomly initialised centres.
    subCenter, center = kmeans((file_path), k, randomCenter((file_path), k))

D:\sofewore\anaconda\lib\site-packages\ipykernel_launcher.py:98: RuntimeWarning: invalid value encountered in double_scalars

# Convert the assignment matrix to an ndarray and print each row's cluster index (column 0).
subCenters = np.array(subCenter)
for i,j in enumerate(subCenters):
    print('第{}个变量属于第{}类'.format(i,j[0]))

第0个变量属于第2.0类
第1个变量属于第0.0类
第2个变量属于第2.0类
第3个变量属于第2.0类
第4个变量属于第2.0类
第5个变量属于第2.0类
第6个变量属于第3.0类
第7个变量属于第2.0类
第8个变量属于第2.0类
第9个变量属于第2.0类
第10个变量属于第2.0类
第11个变量属于第2.0类
第12个变量属于第2.0类
第13个变量属于第2.0类
第14个变量属于第2.0类
第15个变量属于第2.0类
第16个变量属于第2.0类
第17个变量属于第2.0类

k中心聚类算法

import numpy as np
import random

def kMedoids(D, k, tmax=100):
    """Partition points into k clusters around medoids, given their distances.

    :param D: 2-D NumPy array of pairwise distances; ``D[i, j]`` is the
        distance between points i and j
    :param k: number of medoids (clusters)
    :param tmax: maximum number of update iterations
    :return: tuple ``(M, C)``; ``M`` is an array with the k medoid indices and
        ``C`` maps each cluster number to the array of its member indices
    :raises ValueError: if k exceeds the number of points, or the number of
        points left after discarding zero-distance duplicates
    """
    # determine dimensions of distance matrix D
    _, n = D.shape

    if k > n:
        raise ValueError('too many medoids')

    # find a set of valid initial cluster medoid indices since we
    # can't seed different clusters with two points at the same location
    valid_medoid_inds = set(range(n))
    invalid_medoid_inds = set()
    rs, cs = np.where(D == 0)
    # the rows, cols must be shuffled because we will keep the first duplicate below
    index_shuf = list(range(len(rs)))
    np.random.shuffle(index_shuf)
    rs = rs[index_shuf]
    cs = cs[index_shuf]
    for r, c in zip(rs, cs):
        # if there are two points with a distance of 0...
        # keep the first one for cluster init
        if r < c and r not in invalid_medoid_inds:
            invalid_medoid_inds.add(c)
    valid_medoid_inds = list(valid_medoid_inds - invalid_medoid_inds)

    if k > len(valid_medoid_inds):
        raise ValueError('too many medoids (after removing {} duplicate points)'.format(
            len(invalid_medoid_inds)))

    # randomly initialize an array of k medoid indices
    M = np.array(valid_medoid_inds)
    np.random.shuffle(M)
    M = np.sort(M[:k])

    # create a copy of the array of medoid indices
    Mnew = np.copy(M)

    # initialize a dictionary to represent clusters
    C = {}
    for t in range(tmax):
        # determine clusters, i. e. arrays of data indices
        J = np.argmin(D[:, M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J == kappa)[0]
        # update cluster medoids
        for kappa in range(k):
            members = C[kappa]
            if len(members) == 0:
                # nothing to average over: keep the previous medoid
                continue
            mean_dist = np.mean(D[np.ix_(members, members)], axis=1)
            Mnew[kappa] = members[np.argmin(mean_dist)]
        # Position kappa in M/Mnew always refers to cluster kappa, so the two
        # arrays are compared as they are. The former `np.sort(Mnew)` call
        # discarded its result and had no effect.
        if np.array_equal(M, Mnew):
            break
        M = np.copy(Mnew)
    else:
        # tmax reached without convergence: final update of cluster memberships
        J = np.argmin(D[:, M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J == kappa)[0]

    # return results
    return M, C

# # coding: utf-8
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np
# feature matrix built from the sewage dataset
# data = np.array(X)
# # distance matrix
# D = pairwise_distances(data, metric='euclidean')
# # split into 4 clusters
# M, C = kMedoids(D, 4)

# print('medoids:')
# for point_idx in M:
#     print( data[point_idx] )

# print('')
# print('clustering result:')
# for label in C:
#     for point_idx in C[label]:
#         print('第{0}类:　第{1}个变量'.format(label, data[point_idx][0]))

from tkinter import *
import tkinter.filedialog
def run1():
    """Run k-means on the bundled workbook and show the labels in the right pane.

    Reads k from the `inp1` entry, clusters the feature columns of the
    hard-coded Excel file and inserts a ``{row index: [cluster]}`` mapping
    into the `txt2` text widget.
    """
    cluster_count = int(inp1.get())
    frame = pd.read_excel('data/2017年南区污水试验检测结果统计表(周检).xlsx')
    features = np.array(frame.drop(['取样地点','编号','取样日期'],axis=1))
    print(features)
    assignments, _ = kmeans(features, cluster_count, randomCenter(features, cluster_count))
    # column 0 of every assignment row holds the cluster index
    labels = {row: [int(entry[0])] for row, entry in enumerate(np.array(assignments))}
    txt2.insert(END, labels)
def xz():
    """Let the user pick a workbook and show it in the left text pane.

    The chosen path is reported through the `lb4` label and the file is read
    with ``pd.read_excel`` and inserted into the `txt` widget. When the dialog
    is cancelled only the label is updated.
    """
    filename=tkinter.filedialog.askopenfilename()
    # A cancelled dialog returns an empty value: '' or, on some Tk builds, an
    # empty tuple. A truthiness test covers both, whereas `!= ''` let the
    # tuple through and then failed on the string concatenation.
    if not filename:
        lb4.config(text='您没有选择任何文件')
        return
    lb4.config(text='您选择的文件是'+filename)
    raw = pd.read_excel(filename)
    txt.insert(END,raw)

def cop():
    """Open the k-medoids clustering window as a child of the main window.

    The window offers an entry for k, a file-chooser button, a "start"
    button and two text panes (input data on the left, result on the right).
    """
    def run2():
        """Cluster the bundled workbook with k-medoids and display the result."""
        b = int(inp12.get())
        test=pd.read_excel('data/2017年南区污水试验检测结果统计表(周检).xlsx')
        X = test.drop(['取样地点','编号','取样日期'],axis=1)
        txt12.insert(END,X)
        data = np.array(X)
        D = pairwise_distances(data, metric='euclidean')
        M, C = kMedoids(D,b)
        # Map the first feature column of every sample to its cluster label.
        s={}
        for label in C:
            for point_idx in C[label]:
                s[int(data[point_idx][0])]=[label]
        txt22.insert(END,s)
    def xz2():
        """Let the user pick a workbook and dump it into the left text pane."""
        filename=tkinter.filedialog.askopenfilename()
        # A cancelled dialog returns '' or, on some Tk builds, an empty tuple;
        # the truthiness test handles both.
        if not filename:
            lb42.config(text='您没有选择任何文件')
            return
        lb42.config(text='您选择的文件是'+filename)
        raw = pd.read_excel(filename)
        txt12.insert(END,raw)
    winNew = Toplevel(root)
    winNew.geometry('800x700')
    winNew.title('大数据课程设计')
    lb2 = Label(winNew, text='k-中心数据聚类系统',font=('黑体',22,'bold'))
    lb2.place(relx=0.12, rely=0.05, relwidth=0.8, relheight=0.1)
    lb2 = Label(winNew, text='输入簇数k的值')
    lb2.place(relx=0.4, rely=0.2)
    inp12 = Entry(winNew)
    inp12.place(relx=0.52, rely=0.2,relwidth=0.15, relheight=0.05)
    txt12 = Text(winNew)
    txt12.place(relx=0.1,rely=0.3, relheight=0.6,relwidth=0.4)
    txt22 = Text(winNew)
    txt22.place(relx=0.55,rely=0.3, relheight=0.6,relwidth=0.5)
    # NOTE(review): this menu is built but never attached to winNew
    # (there is no winNew.config(menu=...)), so it is not displayed.
    mainmenu = Menu(winNew)
    menuFile = Menu(mainmenu)  # menu group menuFile
    mainmenu.add_cascade(label="文件",menu=menuFile)
    menuEdit = Menu(mainmenu)  # menu group menuEdit
    btn1 = Button(winNew, text='开始计算', command=run2)
    btn1.place(relx=0.7, rely=0.2, relwidth=0.12, relheight=0.05)
    btn2=Button(winNew,text='弹出文件选择对话框',command=xz2)
    btn2.place(relx=0.1, rely=0.2)
    # status label updated by xz2 with the outcome of the file dialog
    lb42 = Label(winNew,text='')
    lb42.place(relx=0.1, rely=0.05)
def cut():
    """Open a secondary K-means window with the k entry and two text panes.

    The window is created as a ``Toplevel`` child of the main window, the
    same way `cop` creates its window, and every widget is attached to the
    new window. Previously a second ``Tk()`` root was created and left empty
    while the title and all widgets were applied to the main `root` window.

    NOTE(review): the widgets created here are local to this function and no
    button is wired to a computation, so the window is display-only.
    """
    root2 = Toplevel(root)
    root2.geometry('800x700')
    root2.title('大数据课程设计')
    lb2 = Label(root2, text='K-means数据聚类系统',font=('黑体',22,'bold'))
    lb2.place(relx=0.12, rely=0.05, relwidth=0.8, relheight=0.1)
    lb2 = Label(root2, text='输入簇数k的值')
    lb2.place(relx=0.1, rely=0.2)
    inp1 = Entry(root2)
    inp1.place(relx=0.22, rely=0.2,relwidth=0.15, relheight=0.05)
    txt = Text(root2)
    txt.place(relx=0.1,rely=0.3, relheight=0.6,relwidth=0.3)
    txt2 = Text(root2)
    txt2.place(relx=0.5,rely=0.3, relheight=0.6,relwidth=0.45)
def popupmenu(event):
    """Post the main menu at the root-window coordinates carried by the event."""
    screen_x, screen_y = event.x_root, event.y_root
    mainmenu.post(screen_x, screen_y)
# Build the main K-means window: heading, k entry, two text panes,
# the file-chooser and "start" buttons, and the algorithm menu.
root = Tk()
root.geometry('800x700')
root.title('大数据课程设计')
lb2 = Label(root, text='K-means数据聚类系统',font=('黑体',22,'bold'))
lb2.place(relx=0.12, rely=0.05, relwidth=0.8, relheight=0.1)
lb2 = Label(root, text='输入簇数k的值')
lb2.place(relx=0.4, rely=0.2)
inp1 = Entry(root)  # read by run1 to obtain k
inp1.place(relx=0.52, rely=0.2,relwidth=0.15, relheight=0.05)
txt = Text(root)  # left pane: filled by xz with the chosen workbook
txt.place(relx=0.1,rely=0.3, relheight=0.6,relwidth=0.42)
txt2 = Text(root)  # right pane: filled by run1 with the cluster labels
txt2.place(relx=0.55,rely=0.3, relheight=0.6,relwidth=0.4)
btn1 = Button(root, text='开始计算', command=run1)
btn1.place(relx=0.7, rely=0.2, relwidth=0.12, relheight=0.05)
btn2=Button(root,text='弹出文件选择对话框',command=xz)
btn2.place(relx=0.1, rely=0.2)
lb4 = Label(root,text='')  # status label updated by xz
lb4.place(relx=0.1, rely=0.05)
mainmenu = Menu(root)
menuFile = Menu(mainmenu)  # menu group menuFile (currently not added to the menu bar)
menuEdit = Menu(mainmenu)  # menu group menuEdit
mainmenu.add_cascade(label="算法",menu=menuEdit)
menuEdit.add_command(label="k-中心",command=cop)
root.config(menu=mainmenu)
# Bind right-click on the root window to the popup menu. Tk event patterns
# must be wrapped in angle brackets; the bare 'Button-3' never matched a
# mouse click.
root.bind('<Button-3>',popupmenu)
root.mainloop()

[[1.00e+00 1.21e+00 2.50e-02 5.00e-02 3.00e-02 1.00e-02]
 [2.00e+00 3.60e+02 9.10e-02 5.05e+01 9.10e-01 1.00e-02]
 [3.00e+00 2.07e+01 2.50e-02 2.03e+00 6.00e-03 1.00e-02]
 [4.00e+00 3.96e+01 6.05e-01 2.76e+01 2.43e+00 1.00e-02]
 [5.00e+00 4.58e+01 3.12e-01 2.17e+01 9.83e-01 1.00e-02]
 [6.00e+00 1.62e+00 2.50e-02 2.68e+00 3.36e-02 1.00e-02]
 [7.00e+00 1.71e+02 5.50e-01 5.94e+01 2.15e+00 1.00e-02]
 [8.00e+00 8.47e+00 7.20e-02 2.41e+00 1.36e-02 1.00e-02]
 [9.00e+00 3.35e+01 3.04e-01 2.09e+01 2.44e-01 1.00e-02]
 [1.00e+01 5.10e+01 1.17e+00 4.23e+01 8.23e-01 1.00e-02]
 [1.10e+01 2.63e+00 3.21e-01 2.46e+00 1.50e-02 1.00e-02]
 [1.20e+01 2.13e+01 1.47e-01 1.81e+01 2.40e-01 1.00e-02]
 [1.30e+01 2.03e+01 2.21e-01 1.84e+01 1.97e-01 1.00e-02]
 [1.40e+01 5.06e+01 4.12e-01 3.64e+01 8.34e-01 1.00e-02]
 [1.50e+01 2.35e+01 2.54e-01 2.31e+01 3.39e-01 1.00e-02]
 [1.60e+01 2.17e+01 2.07e-01 2.22e+01 3.16e-01 1.00e-02]
 [1.70e+01 2.56e+00 3.40e-02 2.66e+00 2.40e-02 1.00e-02]
 [1.80e+01 4.91e+01 3.79e-01 4.43e+01 1.15e+00 1.60e-02]]

# Stand-alone rerun of the k-means pipeline used by the GUI's run1 handler.
test=pd.read_excel('data/2017年南区污水试验检测结果统计表(周检).xlsx')
X = test.drop(['取样地点','编号','取样日期'],axis=1)
k = 3# number of cluster centres
file_path = np.array(X)  # feature table as an ndarray (not a path, despite the name)
print(file_path)
# Pass k to both calls so the number of initial centres cannot drift away from
# the number of clusters (the literal 3 was previously repeated here).
subCenter, center = kmeans((file_path), k, randomCenter((file_path), k))
subCenters = np.array(subCenter)
print(subCenters[0])
# Map each row index to a one-element list holding its cluster index.
data={}
for i,j in enumerate(subCenters):
    data[i]=[int(j[0])]

Happy丶lazy

关注

0
点赞
踩
8

收藏

觉得还不错? 一键收藏
0
评论
20191202_k-中心聚类算法和k-mean算法Python实现

这是最后一个GUI，主要是k-中心聚类算法的实现，之后要总结一下经典学习算法了import pandas as pdfrom pylab import mplimport numpy as npmpl.rcParams['font.sans-serif'] = ['FangSong'] # 指定默认字体mpl.rcParams['axes.unicode_minus'] = False...
复制链接

扫一扫

专栏目录