这是最后一个 GUI,主要是 k-中心(k-medoids)聚类算法的实现;之后要总结一下经典学习算法了。
import pandas as pd
from pylab import mpl
import numpy as np
mpl.rcParams['font.sans-serif'] = ['FangSong'] # set the default sans-serif font
mpl.rcParams['axes.unicode_minus'] = False # keep the minus sign '-' from rendering as a box in saved figures
import matplotlib.pyplot as plt
%matplotlib inline
# Load the Excel workbook into a DataFrame and preview its first rows.
test=pd.read_excel('data/2017年南区污水试验检测结果统计表(周检).xlsx')
test.head()
id | 取样地点 | 取样日期 | 编号 | 化学需氧量 | 氨氮 | 总氮 | 总磷 | 锰 | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 南区污水 | 2017-05-22 | 170522WST01 | 1.21 | 0.025 | 0.05 | 0.030 | 0.01 |
1 | 2 | 南区污水 | 2017-05-22 | 170522WST02 | 360.00 | 0.091 | 50.50 | 0.910 | 0.01 |
2 | 3 | 南区污水 | 2017-06-01 | 170601WST03 | 20.70 | 0.025 | 2.03 | 0.006 | 0.01 |
3 | 4 | 南区污水 | 2017-06-01 | 170601WST04 | 39.60 | 0.605 | 27.60 | 2.430 | 0.01 |
4 | 5 | 南区科研 | 2017-06-05 | 170605WST04 | 45.80 | 0.312 | 21.70 | 0.983 | 0.01 |
# Summarise the DataFrame: column dtypes and non-null counts.
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 9 columns):
id 18 non-null int64
取样地点 18 non-null object
取样日期 18 non-null datetime64[ns]
编号 18 non-null object
化学需氧量 18 non-null float64
氨氮 18 non-null float64
总氮 18 non-null float64
总磷 18 non-null float64
锰 18 non-null float64
dtypes: datetime64[ns](1), float64(5), int64(1), object(2)
memory usage: 1.3+ KB
# Draw per-column histograms on a 20x10 figure.
test.hist(figsize=(20,10))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x00000151DEEE1588>,
<matplotlib.axes._subplots.AxesSubplot object at 0x00000151DEF36828>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x00000151E0FF1BA8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x00000151E1022278>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x00000151E1046908>,
<matplotlib.axes._subplots.AxesSubplot object at 0x00000151E1046940>]],
dtype=object)
![output_4_1.png](output_4_1.png)
# List the distinct values of the sampling-location column.
test['取样地点'].unique()
array(['南区污水', '南区科研', '反渗透水', '超滤出水', '超滤进水', '超滤产水'], dtype=object)
# Drop the sampling-location, sample-number and sampling-date columns. Every other column,
# including 'id', stays in the feature table X.
X = test.drop(['取样地点','编号','取样日期'],axis=1)
下面先通过 K-means 聚类算法验证该思路是否可行。
def randomCenter(data, k):
    '''
    Randomly initialise k cluster centres inside the bounding box of the data.

    :param data: training samples, indexable as data[:, j] (2-D ndarray or
                 np.matrix), one row per sample and one column per feature
    :param k: number of cluster centres
    :return: np.matrix of shape (k, n_features); column j is drawn uniformly
             between the minimum and maximum of feature j
    '''
    n = np.shape(data)[1]  # number of features
    # np.mat was removed in NumPy 2.0; np.asmatrix builds the same matrix type
    # on both old and new NumPy versions.
    cent = np.asmatrix(np.zeros((k, n)))
    for j in range(n):  # fill the centres one feature (column) at a time
        minJ = np.min(data[:, j])
        rangeJ = np.max(data[:, j]) - minJ
        # One np.random.rand(k, 1) draw per column, scaled into [minJ, minJ + rangeJ).
        cent[:, j] = minJ + np.random.rand(k, 1) * rangeJ
    return cent
def kmeans(data, k, cent):
    '''
    Run k-means until no sample changes cluster.

    Each pass assigns every sample to its nearest centre (using the
    module-level distance() helper, i.e. squared Euclidean distance) and then
    moves every centre to the mean of its members.

    :param data: training samples, 2-D with one row per sample
    :param k: number of cluster centres
    :param cent: initial centres, shape (k, n_features); updated in place
    :return: (subCenter, cent) where subCenter is an (m, 2) np.matrix holding,
             per sample, the cluster index and the squared distance to that
             centre, and cent holds the final centres
    '''
    m, n = np.shape(data)  # m: number of samples; n: number of features
    # np.mat was removed in NumPy 2.0. np.asmatrix does not copy an existing
    # matrix, so the caller's centres are still updated in place.
    cent = np.asmatrix(cent)
    subCenter = np.asmatrix(np.zeros((m, 2)))
    # Start from -1 ("unassigned"). Starting from 0 made a first-pass
    # assignment to cluster 0 look like "no change".
    subCenter[:, 0] = -1
    change = True
    while change:
        change = False
        # Assignment step: nearest centre for every sample.
        for i in range(m):
            minDist = np.inf
            minIndex = 0
            for j in range(k):
                dist = distance(data[i, :], cent[j, :])
                if dist < minDist:
                    minDist = dist
                    minIndex = j
            if subCenter[i, 0] != minIndex:
                change = True
                subCenter[i, 0] = minIndex
            # Always refresh the distance: the centre may have moved even when
            # the label did not change.
            subCenter[i, 1] = minDist
        # Update step: move every centre to the mean of its members.
        labels = np.asarray(subCenter[:, 0]).ravel()
        for j in range(k):
            members = data[labels == j]
            r = len(members)  # number of samples in cluster j
            if r == 0:
                # NumPy division by zero yields NaN (with a RuntimeWarning)
                # rather than raising, and a NaN centre can never win a sample
                # again. Keep the previous centre for an empty cluster instead.
                continue
            cent[j, :] = np.sum(members, axis=0) / r
    return subCenter, cent
def save_result(file_name, data):
    '''
    Write a 2-D array to a text file, one row per line, values tab-separated.

    :param file_name: path of the file to write (overwritten if it exists)
    :param data: 2-D data indexable as data[i, j]; each value is written with str()
    :return: None
    '''
    m, n = np.shape(data)
    # The context manager closes the file even if formatting or writing a row raises.
    with open(file_name, "w") as f:
        for i in range(m):
            f.write("\t".join(str(data[i, j]) for j in range(n)) + "\n")
def distance(vecA, vecB):
    '''
    Squared Euclidean distance between two vectors.

    :param vecA: coordinates of vector A (1-D ndarray or single-row np.matrix)
    :param vecB: coordinates of vector B (1-D ndarray or single-row np.matrix)
    :return: the squared Euclidean distance as a NumPy scalar
    '''
    # Flatten the difference to 1-D so the result does not rely on np.matrix
    # semantics of `*`. With two plain ndarrays, `*` is elementwise and the
    # old [0, 0] lookup raised IndexError.
    diff = np.asarray(vecA - vecB).ravel()
    return np.dot(diff, diff)
if __name__ == "__main__":
    # Cluster the feature table X with k-means, starting from random centres.
    k = 4  # number of cluster centres
    file_path = np.array(X)  # despite the name, this is the sample matrix, not a path
    subCenter, center = kmeans(file_path, k, randomCenter(file_path, k))
D:\sofewore\anaconda\lib\site-packages\ipykernel_launcher.py:98: RuntimeWarning: invalid value encountered in double_scalars
# Convert the assignment matrix to a plain ndarray so each row iterates as a 1-D array.
subCenters = np.array(subCenter)
for i, j in enumerate(subCenters):
    # j[0] is the cluster index (stored as a float); j[1] is the squared distance.
    print('第{}个变量属于第{}类'.format(i, j[0]))
第0个变量属于第2.0类
第1个变量属于第0.0类
第2个变量属于第2.0类
第3个变量属于第2.0类
第4个变量属于第2.0类
第5个变量属于第2.0类
第6个变量属于第3.0类
第7个变量属于第2.0类
第8个变量属于第2.0类
第9个变量属于第2.0类
第10个变量属于第2.0类
第11个变量属于第2.0类
第12个变量属于第2.0类
第13个变量属于第2.0类
第14个变量属于第2.0类
第15个变量属于第2.0类
第16个变量属于第2.0类
第17个变量属于第2.0类
k-中心(k-medoids)聚类算法
import numpy as np
import random
def kMedoids(D, k, tmax=100):
    """
    Cluster points with the k-medoids algorithm, given their distance matrix.

    :param D: square 2-D ndarray of pairwise distances; D[i, j] is the
              distance between points i and j
    :param k: number of medoids (clusters)
    :param tmax: maximum number of update iterations
    :return: (M, C) where M is an array of k medoid indices and C maps each
             cluster number 0..k-1 to the array of point indices in it;
             C[kappa] is the cluster of medoid M[kappa]
    :raises ValueError: if k < 1 or k exceeds the number of distinct points
    """
    # determine dimensions of distance matrix D
    m, n = D.shape
    if k < 1:
        raise ValueError('k must be at least 1')
    if k > n:
        raise ValueError('too many medoids')

    # Two points at the same location (distance 0) must not both seed a
    # cluster, so collect the duplicates that are not allowed as initial medoids.
    rs, cs = np.where(D == 0)
    # Shuffle the (row, col) pairs because the first duplicate seen is the one kept.
    index_shuf = list(range(len(rs)))
    np.random.shuffle(index_shuf)
    rs = rs[index_shuf]
    cs = cs[index_shuf]
    invalid_medoid_inds = set()
    for r, c in zip(rs, cs):
        # r < c skips the diagonal and counts each pair once
        if r < c and r not in invalid_medoid_inds:
            invalid_medoid_inds.add(c)
    valid_medoid_inds = list(set(range(n)) - invalid_medoid_inds)

    if k > len(valid_medoid_inds):
        raise ValueError('too many medoids (after removing {} duplicate points)'.format(
            len(invalid_medoid_inds)))

    # randomly pick k distinct medoid indices
    M = np.array(valid_medoid_inds)
    np.random.shuffle(M)
    M = np.sort(M[:k])

    # working copy that receives the updated medoids
    Mnew = np.copy(M)

    # cluster number -> array of member indices
    C = {}
    for t in range(tmax):
        # assign every point to its nearest medoid
        J = np.argmin(D[:, M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J == kappa)[0]
        # within each cluster, the new medoid is the member with the smallest
        # mean distance to the other members
        for kappa in range(k):
            members = C[kappa]
            if members.size == 0:
                # np.argmin raises on an empty cluster; keep its previous medoid
                continue
            mean_dist = np.mean(D[np.ix_(members, members)], axis=1)
            Mnew[kappa] = members[np.argmin(mean_dist)]
        # The original called np.sort(Mnew) here and discarded the result.
        # That no-op is dropped, so M[kappa] stays aligned with C[kappa].
        # check for convergence
        if np.array_equal(M, Mnew):
            break
        M = np.copy(Mnew)
    else:
        # tmax reached without convergence: final update of cluster memberships
        J = np.argmin(D[:, M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J == kappa)[0]

    # return results
    return M, C
# # coding: utf-8
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np
# use the feature table X as the dataset
# data = np.array(X)
# # distance matrix
# D = pairwise_distances(data, metric='euclidean')
# # split into 4 clusters
# M, C = kMedoids(D, 4)
# print('medoids:')
# for point_idx in M:
# print( data[point_idx] )
# print('')
# print('clustering result:')
# for label in C:
# for point_idx in C[label]:
# print('第{0}类: 第{1}个变量'.format(label, data[point_idx][0]))
from tkinter import *
import tkinter.filedialog
def run1():
    """Run k-means on the workbook and show the per-row cluster index in txt2.

    The number of clusters is read from the inp1 entry box; int() raises
    ValueError if the text is not an integer.
    """
    cluster_count = int(inp1.get())
    frame = pd.read_excel('data/2017年南区污水试验检测结果统计表(周检).xlsx')
    features = frame.drop(['取样地点', '编号', '取样日期'], axis=1)
    samples = np.array(features)
    print(samples)
    assignments, _ = kmeans(samples, cluster_count, randomCenter(samples, cluster_count))
    # Map row position -> [cluster index]; column 0 of each assignment row is the index.
    data = {row_idx: [int(row[0])] for row_idx, row in enumerate(np.array(assignments))}
    txt2.insert(END, data)
def xz():
    """Let the user pick a file, show its path in lb4 and append its contents to txt."""
    filename = tkinter.filedialog.askopenfilename()
    # A cancelled dialog returns an empty value. Some Tk builds return an empty
    # tuple rather than '', so test truthiness instead of comparing with ''.
    if filename:
        lb4.config(text='您选择的文件是' + filename)
        raw = pd.read_excel(filename)
        txt.insert(END, raw)
    else:
        lb4.config(text='您没有选择任何文件')
def cop():
    """Open the k-medoids window as a Toplevel of the global `root` and wire up its callbacks."""

    def run2():
        """Cluster the workbook with kMedoids and show the result in txt22."""
        b = int(inp12.get())  # number of clusters typed by the user
        test = pd.read_excel('data/2017年南区污水试验检测结果统计表(周检).xlsx')
        X = test.drop(['取样地点', '编号', '取样日期'], axis=1)
        txt12.insert(END, X)
        data = np.array(X)
        D = pairwise_distances(data, metric='euclidean')
        M, C = kMedoids(D, b)
        s = {}
        for label in C:
            for point_idx in C[label]:
                # a.append('第{0}类: 第{1}个变量'.format(label, data[point_idx][0]))
                # a.append('\n')
                # Key by column 0 of the row, the first column left after the drop above.
                s[int(data[point_idx][0])] = [label]
        txt22.insert(END, s)

    def xz2():
        """Let the user pick a file, show its path in lb42 and append its contents to txt12."""
        filename = tkinter.filedialog.askopenfilename()
        # A cancelled dialog returns an empty value. Some Tk builds return an
        # empty tuple rather than '', so test truthiness instead of comparing with ''.
        if filename:
            lb42.config(text='您选择的文件是' + filename)
            raw = pd.read_excel(filename)
            txt12.insert(END, raw)
        else:
            lb42.config(text='您没有选择任何文件')

    winNew = Toplevel(root)
    winNew.geometry('800x700')
    winNew.title('大数据课程设计')
    lb2 = Label(winNew, text='k-中心数据聚类系统', font=('黑体', 22, 'bold'))
    lb2.place(relx=0.12, rely=0.05, relwidth=0.8, relheight=0.1)
    lb2 = Label(winNew, text='输入簇数k的值')
    lb2.place(relx=0.4, rely=0.2)
    inp12 = Entry(winNew)
    inp12.place(relx=0.52, rely=0.2, relwidth=0.15, relheight=0.05)
    txt12 = Text(winNew)
    txt12.place(relx=0.1, rely=0.3, relheight=0.6, relwidth=0.4)
    txt22 = Text(winNew)
    txt22.place(relx=0.55, rely=0.3, relheight=0.6, relwidth=0.5)
    # NOTE(review): this menu is built but never attached with winNew.config(menu=...),
    # so it is not shown. Confirm whether that is intended.
    mainmenu = Menu(winNew)
    menuFile = Menu(mainmenu)  # menu group menuFile
    mainmenu.add_cascade(label="文件", menu=menuFile)
    # menuFile.add_separator() # separator
    menuEdit = Menu(mainmenu)  # menu group menuEdit
    # mainmenu.add_cascade(label="算法",menu=menuEdit)
    # menuEdit.add_command(label="K-means",command=cut)
    # menuEdit.add_command(label="k-中心",command=cop)
    # btClose=Button(winNew,text='关闭',command=winNew.destroy)
    # btClose.place(relx=0.7,rely=0.5)
    btn1 = Button(winNew, text='开始计算', command=run2)
    btn1.place(relx=0.7, rely=0.2, relwidth=0.12, relheight=0.05)
    btn2 = Button(winNew, text='弹出文件选择对话框', command=xz2)
    btn2.place(relx=0.1, rely=0.2)
    lb42 = Label(winNew, text='')
    lb42.place(relx=0.1, rely=0.05)
def cut():
    """Build the K-means widgets: title label, k entry box and two text boxes.

    NOTE(review): this creates a second Tk root (root2) and sets its geometry,
    but the title and every widget are attached to the global `root` instead.
    Confirm which window was intended. No active code in the visible source
    calls this function (the menu entries that reference it are commented out).
    """
    root2 = Tk()
    root2.geometry('800x700')
    root.title('大数据课程设计')
    # lb1 = Label(root,text='显示信息',font=('黑体',32,'bold'))
    # lb1.place(relx=0.2,rely=0.2)
    lb2 = Label(root, text='K-means数据聚类系统',font=('黑体',22,'bold'))
    lb2.place(relx=0.12, rely=0.05, relwidth=0.8, relheight=0.1)
    lb2 = Label(root, text='输入簇数k的值')
    lb2.place(relx=0.1, rely=0.2)
    inp1 = Entry(root)
    inp1.place(relx=0.22, rely=0.2,relwidth=0.15, relheight=0.05)
    txt = Text(root)
    txt.place(relx=0.1,rely=0.3, relheight=0.6,relwidth=0.3)
    txt2 = Text(root)
    txt2.place(relx=0.5,rely=0.3, relheight=0.6,relwidth=0.45)
def popupmenu(event):
    """Post the global `mainmenu` at the coordinates (event.x_root, event.y_root)."""
    pointer_x = event.x_root
    pointer_y = event.y_root
    mainmenu.post(pointer_x, pointer_y)
# Build the main (K-means) window and start the Tk event loop.
root = Tk()
root.geometry('800x700')
root.title('大数据课程设计')
# lb1 = Label(root,text='显示信息',font=('黑体',32,'bold'))
# lb1.place(relx=0.2,rely=0.2)
lb2 = Label(root, text='K-means数据聚类系统', font=('黑体', 22, 'bold'))
lb2.place(relx=0.12, rely=0.05, relwidth=0.8, relheight=0.1)
lb2 = Label(root, text='输入簇数k的值')
lb2.place(relx=0.4, rely=0.2)
inp1 = Entry(root)  # entry box for the number of clusters k, read by run1
inp1.place(relx=0.52, rely=0.2, relwidth=0.15, relheight=0.05)
txt = Text(root)  # left text box: contents of the file chosen in xz
txt.place(relx=0.1, rely=0.3, relheight=0.6, relwidth=0.42)
txt2 = Text(root)  # right text box: clustering result written by run1
txt2.place(relx=0.55, rely=0.3, relheight=0.6, relwidth=0.4)
btn1 = Button(root, text='开始计算', command=run1)
btn1.place(relx=0.7, rely=0.2, relwidth=0.12, relheight=0.05)
btn2 = Button(root, text='弹出文件选择对话框', command=xz)
btn2.place(relx=0.1, rely=0.2)
lb4 = Label(root, text='')  # status label updated by xz
lb4.place(relx=0.1, rely=0.05)
mainmenu = Menu(root)
menuFile = Menu(mainmenu)  # menu group menuFile
# mainmenu.add_cascade(label="文件",menu=menuFile)
# menuFile.add_separator() # separator
# menuFile.add_command(label="打开",command=ope)
menuEdit = Menu(mainmenu)  # menu group menuEdit
mainmenu.add_cascade(label="算法", menu=menuEdit)
# menuEdit.add_command(label="K-means",command=cut)
menuEdit.add_command(label="k-中心", command=cop)
root.config(menu=mainmenu)
# Tk event names must be wrapped in angle brackets. The bare string 'Button-3'
# is read as a sequence of key presses, so the right-click popup never fired.
root.bind('<Button-3>', popupmenu)  # show the menu on right click in the root window
root.mainloop()
[[1.00e+00 1.21e+00 2.50e-02 5.00e-02 3.00e-02 1.00e-02]
[2.00e+00 3.60e+02 9.10e-02 5.05e+01 9.10e-01 1.00e-02]
[3.00e+00 2.07e+01 2.50e-02 2.03e+00 6.00e-03 1.00e-02]
[4.00e+00 3.96e+01 6.05e-01 2.76e+01 2.43e+00 1.00e-02]
[5.00e+00 4.58e+01 3.12e-01 2.17e+01 9.83e-01 1.00e-02]
[6.00e+00 1.62e+00 2.50e-02 2.68e+00 3.36e-02 1.00e-02]
[7.00e+00 1.71e+02 5.50e-01 5.94e+01 2.15e+00 1.00e-02]
[8.00e+00 8.47e+00 7.20e-02 2.41e+00 1.36e-02 1.00e-02]
[9.00e+00 3.35e+01 3.04e-01 2.09e+01 2.44e-01 1.00e-02]
[1.00e+01 5.10e+01 1.17e+00 4.23e+01 8.23e-01 1.00e-02]
[1.10e+01 2.63e+00 3.21e-01 2.46e+00 1.50e-02 1.00e-02]
[1.20e+01 2.13e+01 1.47e-01 1.81e+01 2.40e-01 1.00e-02]
[1.30e+01 2.03e+01 2.21e-01 1.84e+01 1.97e-01 1.00e-02]
[1.40e+01 5.06e+01 4.12e-01 3.64e+01 8.34e-01 1.00e-02]
[1.50e+01 2.35e+01 2.54e-01 2.31e+01 3.39e-01 1.00e-02]
[1.60e+01 2.17e+01 2.07e-01 2.22e+01 3.16e-01 1.00e-02]
[1.70e+01 2.56e+00 3.40e-02 2.66e+00 2.40e-02 1.00e-02]
[1.80e+01 4.91e+01 3.79e-01 4.43e+01 1.15e+00 1.60e-02]]
# Re-run k-means on the workbook with k = 3 and collect the cluster index of every row.
test = pd.read_excel('data/2017年南区污水试验检测结果统计表(周检).xlsx')
X = test.drop(['取样地点', '编号', '取样日期'], axis=1)
k = 3  # number of cluster centres
file_path = np.array(X)  # despite the name, this is the sample matrix, not a path
print(file_path)
# Pass k rather than a second literal 3, so the number of initial centres
# cannot drift from the k given to kmeans.
subCenter, center = kmeans(file_path, k, randomCenter(file_path, k))
subCenters = np.array(subCenter)
print(subCenters[0])
data = {}
for i, j in enumerate(subCenters):
    # j[0] is the cluster index of row i (stored as a float)
    data[i] = [int(j[0])]