Kmeans2Pmml.py
# -*- coding:utf-8 -*-
import pandas
from sklearn.model_selection import train_test_split
import numpy as np # 导入numpy库
import matplotlib.pyplot as plt # 导入matplotlib库
from sklearn.cluster import KMeans # 导入sklearn聚类模块
from sklearn import metrics # 导入sklearn效果评估模块
import random
from sklearn.decomposition import PCA
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml
def make_result_pic(x_train, silhouette_s, pic_kmeans, data_view, n_clusters, result_pic):
    """Draw a summary picture of a K-means clustering and save it to a file.

    The samples are projected to two dimensions with PCA, ``pic_kmeans`` is
    fitted on that projection, and the figure is laid out on a 2x2 grid:

    * cell 1: scatter plot of every cluster plus its centre;
    * cell 2: text only, the silhouette score followed by ``data_view``;
    * cell 3: text only, the number of samples assigned to each cluster.

    Args:
        x_train: Feature matrix handed to ``PCA.fit_transform``.
        silhouette_s: Score displayed in the figure; it is only stringified here.
        pic_kmeans: Clustering estimator; it is fitted in place on the 2-D
            projection (not on the original features).
        data_view: Text appended after the silhouette score.
        n_clusters: Number of clusters to draw. NOTE(review): presumably equal
            to the estimator's own cluster count; confirm against the caller.
        result_pic: Destination passed to ``Figure.savefig``.
    """
    # Reduce to 2-D so the clusters can be plotted whatever the feature count.
    reduced_data = PCA(n_components=2).fit_transform(x_train)
    print(len(reduced_data))
    pic_kmeans.fit(reduced_data)
    x_pre = pic_kmeans.predict(reduced_data)

    # Samples per cluster, in order of first appearance. Labels are converted
    # to plain ints so the figure text reads "{0: 12, ...}" on every NumPy
    # version instead of "{np.int32(0): 12, ...}".
    distribution = {}
    for label in x_pre:
        label = int(label)
        distribution[label] = distribution.get(label, 0) + 1

    centers = pic_kmeans.cluster_centers_  # centre of every cluster
    # One random RGB colour per cluster.
    color_list = [
        tuple(round(random.uniform(0, 1), 4) for _ in range(3))
        for _ in range(n_clusters)
    ]

    fig = plt.figure(figsize=(10, 10))
    try:
        # Cell 1: the clusters and their centres.
        plt.subplot(2, 2, 1)
        for i in range(n_clusters):
            cluster = reduced_data[x_pre == i]  # samples assigned to cluster i
            # ``color=`` rather than ``c=``: a bare RGB tuple passed as ``c`` is
            # indistinguishable from three values to be colour-mapped.
            plt.scatter(cluster[:, 0], cluster[:, 1], color=color_list[i], marker='.')
            plt.plot(centers[i][0], centers[i][1], 'o',
                     markerfacecolor=color_list[i], markeredgecolor='k',
                     markersize=6)

        # Cell 2: silhouette score and data split summary (title only).
        plt.subplot(2, 2, 2)
        plt.axis('off')
        plt.title('silhouette_s:' + str(silhouette_s) + " " + data_view, loc='center')

        # Cell 3: samples per cluster (title only).
        plt.subplot(2, 2, 3)
        plt.axis('off')
        plt.title('distribution:' + str(distribution), loc='center')

        # Let matplotlib adjust the spacing, then write the picture.
        fig.tight_layout()
        fig.savefig(result_pic)
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)
def main(modelName, trainingFilePath, code, k_test_size, k_clusters, k_random_state, k_max_iter):
    """Train a K-means model on a CSV file and export it as PMML plus a PNG report.

    Steps performed:

    1. read ``trainingFilePath`` with pandas using encoding ``code``;
    2. drop the first column and use every remaining column as a feature;
    3. split the features into train/test parts (only used for the report);
    4. fit a ``PMMLPipeline`` wrapping ``KMeans`` on ALL rows and compute the
       silhouette score of its predictions;
    5. draw the report picture from the training part via ``make_result_pic``;
    6. write the pipeline to ``modelName`` with ``sklearn2pmml``.

    Args:
        modelName: Output path of the PMML file. The picture is written next
            to it with the extension replaced by ``.png``.
        trainingFilePath: Path of the CSV file to read.
        code: Text encoding passed to ``pandas.read_csv``.
        k_test_size: ``test_size`` passed to ``train_test_split``.
        k_clusters: ``n_clusters`` for ``KMeans``.
        k_random_state: ``random_state`` for ``KMeans`` (may be ``None``).
        k_max_iter: ``max_iter`` for ``KMeans``.

    Raises:
        SystemExit: With status 1 when the CSV has fewer than three columns.
    """
    import os  # local import: this module's header does not import os

    # Swap only the file extension. A plain str.replace("pmml", "png") would
    # also rewrite "pmml" inside directory names and would leave the path
    # unchanged (so the picture and the model collide) when the extension is
    # missing or spelled differently.
    result_pic = os.path.splitext(str(modelName))[0] + ".png"

    iris_df = pandas.read_csv(trainingFilePath, encoding=code)
    columns = iris_df.columns.tolist()

    # The first column is treated as a row identifier and excluded.
    # NOTE(review): the original comment also said the last column is a label
    # column, but it is kept as a feature here; confirm which is intended.
    # NOTE: Index.difference may reorder the remaining columns (pandas sorts
    # them when it can), so feature order can differ from the CSV order.
    feature_columns = iris_df.columns.difference([columns[0]])
    X = iris_df[feature_columns]

    # Split into train/test parts; the sizes are shown in the report picture.
    x_train, x_test = train_test_split(X, test_size=k_test_size, random_state=0)
    data_view = "total: " + str(len(iris_df)) + " train:" + str(len(x_train)) + " test:" + str(len(x_test))
    print(data_view)

    if len(columns) < 3:
        print("columnNum error")
        # exit() is an interactive helper installed by the site module and is
        # not guaranteed to exist; raising SystemExit is the portable form.
        raise SystemExit(1)
    print("check success")

    # Two independent estimators: one is exported, the other is fitted on the
    # 2-D projection inside make_result_pic purely for drawing.
    model_kmeans = KMeans(n_clusters=k_clusters, random_state=k_random_state, max_iter=k_max_iter)
    pic_kmeans = KMeans(n_clusters=k_clusters, random_state=k_random_state, max_iter=k_max_iter)
    pipeline = PMMLPipeline([
        ("classifier", model_kmeans),
    ])
    pipeline.fit(X)  # train on every row
    y_pre = pipeline.predict(X)

    # Model quality: mean silhouette coefficient, rounded for display.
    silhouette_s = metrics.silhouette_score(X, y_pre, metric='euclidean')
    silhouette_s = round(silhouette_s, 4)

    # Draw the report, then export the model.
    make_result_pic(x_train, silhouette_s, pic_kmeans, data_view, k_clusters, result_pic)
    sklearn2pmml(pipeline, modelName, with_repr=True)
kmeans_main.py
# -*- coding:utf-8 -*-
import sys
import kmeans_pmml.Kmeans2Pmml as m
#sys.argv[0] 为脚本自身
#m.main(sys.argv[1],sys.argv[2])
from util import codingUtil
# Command-line entry point. sys.argv[0] is the script itself; the remaining
# arguments are, in order:
#   modelName trainingFilePath k_clusters k_random_state|None k_max_iter k_test_size
# Example values:
#   E:/data/out/kmeans.pmml E:/data/cluster2.csv 3 None 200 0.2
try:
    if len(sys.argv) < 7:
        # Fail with a usable message instead of "list index out of range".
        raise ValueError(
            "expected 6 arguments (modelName trainingFilePath k_clusters "
            "k_random_state|None k_max_iter k_test_size), got "
            + str(len(sys.argv) - 1)
        )
    modelName = sys.argv[1]
    trainingFilePath = sys.argv[2]
    k_clusters = int(sys.argv[3])  # original note: the default is 8
    # The literal string "None" selects an unseeded (non-reproducible) run.
    k_random_state = None if sys.argv[4] == "None" else int(sys.argv[4])
    k_max_iter = int(sys.argv[5])
    k_test_size = float(sys.argv[6])
    # Detect the CSV encoding before handing the file to pandas.
    code = codingUtil.file_encoding(trainingFilePath)
    m.main(modelName, trainingFilePath, code, k_test_size, k_clusters, k_random_state, k_max_iter)
except Exception as e:
    print('Exception :\t\t', str(e))
    # Report the failure to the calling process as well; main() already exits
    # with status 1 on invalid input, so a non-zero status here is consistent.
    sys.exit(1)
结果图包含三部分:聚类效果图、数据分布(样本总数及训练集/测试集数量)以及各簇的样本分布情况。
总体代码参考自:宋天龙《Python数据分析与数据化运营》。
不过书中案例的数据只有两个特征,恰好可以直接绘制聚类效果图;
实际数据可能包含多个特征,因此需要先降维(这里用 PCA 降到 2 维)再绘图。