聚类算法基类实现(clusters.py):
# _*_ coding: utf-8 _*_
# @Date : 2023/3/11 16:57
# @Author : Paul
# @File : clusters.py
# @Description : 聚类算法基类
import pandas as pd
import io
import matplotlib.pyplot as plt
from core.utils.string_utils import StringUtils
from sklearn.impute import SimpleImputer
from core.algo.base_algo import BaseAlgo
from core.data_source.meta_data_source.meta_data_source import MetaDataSource
from core.utils.data_souce_init_utils import DataSourceInitUtil
from core.utils.date_util import DateUtil
class Cluster(BaseAlgo):
    """Base class for clustering algorithms.

    Loads the training data from the configured data source, applies the
    pre-processing strategies described in ``param``, renders a distribution
    plot of the raw data, and optionally scales the features.
    """

    def __init__(self,
                 app_name="clusters",
                 data_source_id=None,
                 table_name=None,
                 feature_cols=None,
                 param=None,
                 ):
        """
        Initialize the clustering base class.

        :param app_name: application name, also embedded in image file names
        :param data_source_id: id of the data source holding the training table
        :param table_name: table to read the training data from
        :param feature_cols: feature column names to select
        :param param: full request parameter dict (pre-processing, scaling, ...)
        """
        # Start time of the whole run (used later for cost accounting).
        self.start_time = DateUtil.getCurrentDate()
        super(Cluster, self).__init__(app_name=app_name)
        self.table_name = table_name
        self.feature_cols = feature_cols
        # Textual summary of the data frame (output of DataFrame.info()).
        self.info = None
        # Descriptive statistics (output of DataFrame.describe()).
        self.describe = None
        # Path of the 2-D distribution image of the raw data.
        self.two_dim_dis_image = (self.image_path + "two_dim_dis_" + app_name
                                  + "_" + DateUtil.getCurrentDateSimple() + ".png")
        # Path of the clustering prediction image.
        self.cluster_pred_image = (self.image_path + "cluster_pred_" + app_name
                                   + "_" + DateUtil.getCurrentDateSimple() + ".png")
        # Meta database (stores run results / logs).
        self.meta_data_source = MetaDataSource()
        # Data source that holds the training table.
        self.data_source = DataSourceInitUtil.getDataBase(self.meta_data_source,
                                                          data_source_id)
        # Training data projected by PCA (set only when > 3 features).
        self.train_data_dr = None
        self.param = param

    def getModelData(self):
        """
        Build the modelling data set.

        Reads the feature columns from the source table, records summary
        statistics, applies the configured pre-processing steps, renders a
        distribution plot and optionally scales the data.

        :return: tuple ``(train_data, test_data)``; clustering is
            unsupervised, so both elements are the same data set.
        """
        data_query_sql = "select {} from {}".format(",".join(self.feature_cols),
                                                    self.table_name)
        data = self.data_source.queryAll(data_query_sql)
        data = pd.DataFrame(data=data,
                            columns=self.feature_cols)
        # DataFrame.info() writes to a stream, so capture it via StringIO.
        buf = io.StringIO()
        data.info(buf=buf)
        self.info = buf.getvalue()
        # Descriptive statistics of the columns.
        self.describe = data.describe()
        # Normalize the pre-processing strategy list: drop null entries and
        # entries without a target feature. ``get`` may return None when the
        # key is absent, hence the ``or []`` guard.
        process_method_list = self.param.get("preProcessMethodList") or []
        process_method_list_after_process = [
            m for m in process_method_list
            if m is not None and m != "null"
            and not StringUtils.isBlack(m.get("preProcessFeature"))
        ]
        self.param["preProcessMethodList"] = process_method_list_after_process
        for process_method in process_method_list_after_process:
            self._applyPreProcess(data, process_method)
        # Render the raw data distribution image.
        self._plotDistribution(data)
        # Feature scaling strategy (optional; absent key means no scaling).
        standardization = self.param.get("standardization")
        if standardization == "MinMaxScaler":
            from sklearn.preprocessing import MinMaxScaler
            data = MinMaxScaler().fit_transform(data)
        elif standardization == "StandardScaler":
            from sklearn.preprocessing import StandardScaler
            data = StandardScaler().fit_transform(data)
        # Unsupervised learning: train and "test" sets are the same data.
        return data, data

    def _applyPreProcess(self, data, process_method):
        """Apply one pre-processing strategy to ``data`` in place.

        :param data: the DataFrame being prepared
        :param process_method: dict with ``preProcessFeature``,
            ``preProcessMethod`` and ``preProcessMethodValue`` keys
        """
        pre_process_feature = process_method.get("preProcessFeature")
        method = process_method.get("preProcessMethod")
        method_value = process_method.get("preProcessMethodValue")
        # 1. Drop rows where the target feature is missing.
        if method == "deletena":
            data.dropna(subset=[pre_process_feature],
                        axis=0,        # axis=0: drop rows
                        how='any',     # any NaN in the subset drops the row
                        inplace=True)
        # 2. Impute missing values with the configured strategy.
        elif method == "fillna":
            imputers = {
                "mean": lambda: SimpleImputer(),
                "median": lambda: SimpleImputer(strategy="median"),
                "most_frequent": lambda: SimpleImputer(strategy="most_frequent"),
                "constant_0": lambda: SimpleImputer(strategy="constant", fill_value=0),
                "constant_1": lambda: SimpleImputer(strategy="constant", fill_value=1),
            }
            factory = imputers.get(method_value)
            if factory is not None:
                # SimpleImputer expects a 2-D array, hence reshape(-1, 1).
                data[pre_process_feature] = factory().fit_transform(
                    data[pre_process_feature].values.reshape(-1, 1))
        # 3. Encode a categorical feature as its first-seen integer index.
        elif method == "transClassFeature":
            unique_value = data[pre_process_feature].unique().tolist()
            data[pre_process_feature] = data[pre_process_feature].apply(
                lambda x: unique_value.index(x))
        # 4. Cast the column to the requested dtype.
        elif method == "transType":
            if method_value in ("int", "float"):
                data[pre_process_feature] = data[pre_process_feature].astype(method_value)

    def _plotDistribution(self, data):
        """Render the raw data distribution image (2-D, 3-D, or PCA-reduced)."""
        n_features = len(self.feature_cols)
        if n_features == 2:
            fig, ax1 = plt.subplots(1)
            ax1.scatter(data[self.feature_cols[0]],
                        data[self.feature_cols[1]],
                        marker="o",
                        s=15)
            plt.savefig(self.two_dim_dis_image, dpi=300)
        elif n_features == 3:
            # 3-D scatter plot of the three features.
            ax = plt.subplot(projection='3d')
            ax.set_title('3d_image_show')
            ax.scatter(data[self.feature_cols[0]],
                       data[self.feature_cols[1]],
                       data[self.feature_cols[2]],
                       s=15,
                       marker="o",
                       c='r')
            plt.savefig(self.two_dim_dis_image, dpi=300)
        elif n_features > 3:
            # Too many dimensions to plot directly: project to 2-D with PCA
            # and keep the projection for later prediction plots.
            from sklearn.decomposition import PCA
            self.train_data_dr = PCA(n_components=2).fit_transform(data)
            ax = plt.subplot()
            ax.scatter(self.train_data_dr[:, 0],
                       self.train_data_dr[:, 1],
                       s=15,
                       marker="o",
                       c='r')
            plt.savefig(self.two_dim_dis_image, dpi=300)
if __name__ == '__main__':
    # Manual smoke test: load and pre-process a two-feature demo table.
    demo = Cluster(app_name="cluster_demo",
                   data_source_id=9,
                   table_name="data1",
                   feature_cols=["age", "income"])
    demo.getModelData()
KMeans 算法实现(kmeans.py):
# _*_ coding: utf-8 _*_
# @Date : 2023/3/10 21:03
# @Author : Paul
# @File : kmeans.py
# @Description : Kmeans聚类算法
import matplotlib.pyplot as plt
import sys
import json
from core.beans.param_train_result import ParamTrainResult
from core.utils.string_utils import StringUtils
from clusters.cluster import Cluster
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from core.beans.cluster_result import ClusterResult
from core.utils.color_util import ColorUtil
from core.utils.date_util import DateUtil
from core.utils.log_util import LogUtil
class TWKMeansAlgo(Cluster):
    """KMeans clustering algorithm built on the :class:`Cluster` base class."""

    def __init__(self,
                 app_name="clusters",
                 data_source_id=None,
                 table_name=None,
                 feature_cols=None,
                 param=None,
                 ):
        """
        Initialize the KMeans algorithm.

        :param app_name: application name, used in result/image file names
        :param data_source_id: id of the data source holding the training table
        :param table_name: table to read the training data from
        :param feature_cols: feature column names to select
        :param param: full request parameter dict (algoParam, paramTrain, ...)
        """
        super(TWKMeansAlgo, self).__init__(app_name=app_name,
                                           data_source_id=data_source_id,
                                           table_name=table_name,
                                           feature_cols=feature_cols,
                                           param=param)
        # Fitted KMeans estimator, created in initModel(). NOTE: this was
        # previously misspelled ``self.cluser``, so the attribute the other
        # methods read was never initialized here.
        self.cluster = None
        # Cluster centroids (cluster_centers_) after fitting.
        self.centroid = None
        # Silhouette score of the fitted model (evaluation metric).
        self.silhouette_score = None
        # Whether evalModel() should run after training.
        self.IS_MODEL_EVAL = True

    def initModel(self):
        """Create the KMeans estimator from the request parameters."""
        algoParam = self.param["algoParam"]
        # Default to sklearn's standard 8 clusters when not supplied.
        self.n_clusters = 8 if StringUtils.isBlack(algoParam["nClusters"]) else int(algoParam["nClusters"])
        # Fixed random_state keeps runs reproducible.
        self.cluster = KMeans(n_clusters=self.n_clusters,
                              random_state=0)

    def buildModel(self, train_data):
        """
        Fit the model and render the prediction image.

        :param train_data: pre-processed training data from getModelData()
        """
        self.cluster = self.cluster.fit(train_data)
        # labels_: the cluster assigned to each sample.
        y_pred = self.cluster.labels_
        # cluster_centers_: the fitted centroids.
        self.centroid = self.cluster.cluster_centers_
        n_features = len(self.feature_cols)
        if n_features == 2:
            fig, ax1 = plt.subplots(1)
            for i in range(self.n_clusters):
                ax1.scatter(train_data.iloc[y_pred == i, 0],
                            train_data.iloc[y_pred == i, 1],
                            marker="o",   # point shape
                            s=8,          # point size
                            c=ColorUtil.getRandomColor())
            # Mark the centroids with black crosses.
            ax1.scatter(self.centroid[:, 0], self.centroid[:, 1],
                        marker="x",
                        s=15,
                        c="black")
            plt.savefig(self.cluster_pred_image, dpi=300)
        elif n_features == 3:
            ax = plt.subplot(projection='3d')
            ax.set_title('3d_image_show')
            for i in range(self.n_clusters):
                ax.scatter(train_data.iloc[y_pred == i, 0],
                           train_data.iloc[y_pred == i, 1],
                           train_data.iloc[y_pred == i, 2],
                           marker="o",
                           s=15,
                           c=ColorUtil.getRandomColor())
            plt.savefig(self.cluster_pred_image, dpi=300)
        elif n_features > 3:
            # More than 3 features: plot the 2-D PCA projection computed in
            # Cluster.getModelData() (self.train_data_dr).
            ax = plt.subplot()
            ax.set_title('3d_image_show')
            for i in range(self.n_clusters):
                ax.scatter(self.train_data_dr[y_pred == i, 0],
                           self.train_data_dr[y_pred == i, 1],
                           marker="o",
                           s=15,
                           c=ColorUtil.getRandomColor())
            plt.savefig(self.cluster_pred_image, dpi=300)

    def evalModel(self, train_data, test_data):
        """
        Evaluate the model and persist the run result.

        :param train_data: data the model was fitted on
        :param test_data: unused; kept for interface compatibility
        """
        # Silhouette score of the final clustering.
        self.silhouette_score = silhouette_score(train_data, self.cluster.labels_)
        end_time = DateUtil.getCurrentDate()
        cost_second = DateUtil.diffMin(self.start_time, end_time)
        # Persist the full run result to the meta database (MySQL).
        classifier_result = ClusterResult(self.param["id"],
                                          "kmeans",
                                          self.param,
                                          self.app_name,
                                          self.info,
                                          self.describe,
                                          self.two_dim_dis_image,
                                          self.cluster_pred_image,
                                          self.centroid,
                                          self.silhouette_score,
                                          "success",
                                          self.start_time,
                                          end_time,
                                          cost_second)
        LogUtil.saveClusterResult(self.meta_data_source, classifier_result)

    def paramTrain(self):
        """
        Hyper-parameter search: sweep ``n_clusters`` over the requested range
        and plot the silhouette-score learning curve.
        """
        param_train = self.param["paramTrain"]
        param_name = None if StringUtils.isBlack(param_train["paramName"]) else str(param_train["paramName"])
        param_start_value = None if StringUtils.isBlack(param_train["paramStartValue"]) else int(param_train["paramStartValue"])
        param_end_value = None if StringUtils.isBlack(param_train["paramEndValue"]) else int(param_train["paramEndValue"])
        param_range_value = None if StringUtils.isBlack(param_train["paramRangeValue"]) else int(param_train["paramRangeValue"])
        train_data, test_data = self.getModelData()
        eval_value_list = []
        # silhouette_score needs at least 2 clusters, hence start value != 1.
        if (param_name is None or param_start_value is None
                or param_end_value is None or param_range_value is None
                or param_start_value == 1):
            error_info = "请确认参数必须为整数,且参数起始值不能为1"
            end_time = DateUtil.getCurrentDate()
            cost_second = DateUtil.diffMin(self.start_time, end_time)
            # Record the failed run in the meta database.
            param_train_result = ParamTrainResult(self.param["id"],
                                                  "kmeans",
                                                  self.param,
                                                  self.app_name,
                                                  error_info,
                                                  "failed",
                                                  self.start_time,
                                                  end_time,
                                                  cost_second)
            LogUtil.saveParamTrainResult(self.meta_data_source, param_train_result)
        elif param_name == "n_clusters":
            cluster_range = range(param_start_value, param_end_value, param_range_value)
            for n_clusters in cluster_range:
                self.cluster = KMeans(n_clusters=n_clusters,
                                      random_state=0).fit(train_data)
                eval_value_list.append(
                    silhouette_score(train_data, self.cluster.labels_))
            # Save the learning-curve image.
            param_train_image = (self.image_path + "cluster_param_train_"
                                 + DateUtil.getCurrentDateSimple() + ".png")
            fig, ax = plt.subplots(1, 1)
            ax.set_title("聚类个数--轮廓函数--超参数学习曲线")
            ax.plot(list(cluster_range), eval_value_list)
            plt.savefig(param_train_image, dpi=300)
            end_time = DateUtil.getCurrentDate()
            cost_second = DateUtil.diffMin(self.start_time, end_time)
            # Record the successful run in the meta database.
            param_train_result = ParamTrainResult(self.param["id"],
                                                  "kmeans",
                                                  self.param,
                                                  self.app_name,
                                                  param_train_image,
                                                  "success",
                                                  self.start_time,
                                                  end_time,
                                                  cost_second)
            LogUtil.saveParamTrainResult(self.meta_data_source, param_train_result)
if __name__ == '__main__':
    # The request is passed as a JSON string in the first CLI argument.
    param = json.loads(sys.argv[1])
    kmeans = TWKMeansAlgo(app_name=param["appName"],
                          data_source_id=param["dataSourceId"],
                          table_name=param["tableName"],
                          feature_cols=param["featureCols"],
                          param=param)
    # A "paramTrain" key switches the run into hyper-parameter search mode;
    # otherwise run the normal train/evaluate pipeline.
    if "paramTrain" in param:
        kmeans.paramTrain()
    else:
        kmeans.execute()
详细代码见 Gitee 仓库 twinkle_algo_plat(晓烁算法平台算法端)。