数据集
模型训练
import sklearn.metrics as skm
from sklearn import cluster #聚类包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
# End-to-end K-Means demo: load the table, min-max scale the features, fit a
# 2-cluster model, print quality metrics, persist the model, then reload it
# and use the *persisted* parameters for inference.

# Single place for the model file location (used by both dump and load).
MODEL_PATH = '2.模型保存\\01.聚类及参数模型.pkl'

data = pd.read_excel("数据\\01.聚类数据1.xlsx")
# Use the name column as the row index so samples can be identified by name.
data.index = data.姓名
# Every column after the first one is used as a feature for clustering.
data_select = data.iloc[:, 1:]

# Column-wise min-max scaling to [0, 1].
# A constant column has max == min; dividing by that zero range would produce
# NaN and make KMeans.fit fail, so such a range is replaced by 1 (the column
# then simply scales to all zeros).
col_min = data_select.min(axis=0)
col_range = (data_select.max(axis=0) - col_min).replace(0, 1)
data_select = (data_select - col_min) / col_range

# n_init and random_state are pinned so repeated runs (and different
# scikit-learn versions, whose n_init default differs) give the same result.
model = cluster.KMeans(n_clusters=2, max_iter=10, n_init=10, random_state=0)
result = model.fit(data_select)  # fit() returns the estimator itself

print("1.分类结果标签:", result.labels_)  # cluster label of every sample
print("2.各特征各分组的中心点值:", result.cluster_centers_)  # centroid per cluster
# Calinski-Harabasz index (variance-ratio criterion): larger is better.
print("*3.ch 指标vrc =F值=ssa/sse:", skm.calinski_harabasz_score(data_select, result.labels_))
# Mean silhouette coefficient over all samples: larger is better.
print("*4.SI(轮廓系数,越大分得越好) :", skm.silhouette_score(data_select, result.labels_))
# Within-cluster sum of squared distances: smaller is better.
print("*5.sse (组内距离,簇内距离,越小越好): ", result.inertia_)
# Silhouette coefficient of each individual sample.
print("6.SI(每个点数据轮廓系数,越大分得越好) :", skm.silhouette_samples(data_select, result.labels_))
# joblib.dump returns the list of file names it wrote.
print("7.存储模型:", joblib.dump(model, MODEL_PATH))

# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
train_model = joblib.load(MODEL_PATH)
# Read the labels stored in the persisted model. Calling fit() again here
# would retrain from scratch and throw away the parameters just loaded.
print("结果:", train_model.labels_)

# Wrap the new (already scaled) sample in a DataFrame carrying the training
# column names, because the model was fitted on a DataFrame.
new_sample = pd.DataFrame(
    [[0.000000, 0.363636, 1.000000]],
    columns=data_select.columns,
)
print("预测推断:", train_model.predict(new_sample))
out:
1.分类结果标签: [1 0 1 1 0 1]
2.各特征各分组的中心点值: [[0.14285714 0.18181818 1. ]
[0.625 0.70454545 0.41666667]]
*3.ch 指标vrc =F值=ssa/sse: 3.164640105724566
*4.SI(轮廓系数,越大分得越好) : 0.30853316791257973
*5.sse (组内距离,簇内距离,越小越好): 1.425726420044602
6.SI(每个点数据轮廓系数,越大分得越好) : [0.13635519 0.56654 0.00488626 0.21229708 0.59024083 0.34087965]
7.存储模型: ['2.模型保存\\01.聚类及参数模型.pkl']
结果: [1 0 1 1 0 1]
预测推断: [0]