基类实现(classifiers.py:分类算法的公共基类,负责取数、预处理与训练/测试集划分)
# _*_ coding: utf-8 _*_
# @Date : 2023/3/14 18:47
# @Author : Paul
# @File : classifiers.py
# @Description :
import pandas as pd
import io
import matplotlib.pyplot as plt
from core.utils.string_utils import StringUtils
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from core.algo.base_algo import BaseAlgo
from core.data_source.meta_data_source.meta_data_source import MetaDataSource
from core.utils.data_souce_init_utils import DataSourceInitUtil
from core.utils.date_util import DateUtil
class Classifier(BaseAlgo):
    """
    Base class for classification algorithms.

    Loads the training table from a configured data source, captures a data
    summary and descriptive statistics, applies the preprocessing strategies
    described in ``param``, and splits the data into train/test sets.
    """

    def __init__(self,
                 app_name="clusters",
                 data_source_id=None,
                 table_name=None,
                 feature_cols=None,
                 class_col=None,
                 train_size=None,
                 param=None):
        """
        Initialise the classifier.

        :param app_name: application name, used in generated image file names
        :param data_source_id: id of the data source that holds the table
        :param table_name: name of the training data table
        :param feature_cols: list of feature column names
        :param class_col: one-element list with the label column name
        :param train_size: fallback train split ratio, used when ``param``
                           does not supply ``trainDataRatio``
        :param param: algorithm parameter dict coming from the frontend
        """
        super(Classifier, self).__init__(app_name=app_name)
        self.param = param
        # Run start time, used later to compute the cost of the job.
        self.start_time = DateUtil.getCurrentDate()
        self.table_name = table_name
        # Guard against None so the ``all_col`` concatenation cannot crash.
        self.feature_cols = list(feature_cols) if feature_cols else []
        self.class_col = list(class_col) if class_col else []
        self.train_size = train_size
        self.all_col = self.feature_cols + self.class_col
        self.clf = None
        # Concise data summary (output of DataFrame.info).
        self.info = None
        # Descriptive statistics (output of DataFrame.describe).
        self.describe = None
        date_tag = DateUtil.getCurrentDateSimple()
        # Two-dimensional distribution plot path.
        self.two_dim_dis_image = self.image_path + "two_dim_dis_" + app_name + "_" + date_tag + ".png"
        # Prediction result plot path.
        self.classifier_pred_image = self.image_path + "cluster_pred_" + app_name + "_" + date_tag + ".png"
        # Meta database where run results are stored.
        self.meta_data_source = MetaDataSource()
        # Data source that stores the training data.
        self.data_source = DataSourceInitUtil.getDataBase(self.meta_data_source,
                                                          data_source_id)
        self.labels = None
        # Train split ratio: prefer param["trainDataRatio"], fall back to the
        # train_size argument. (Bug fix: the original unconditionally indexed
        # self.param["trainDataRatio"] and crashed when param was None or the
        # key was absent — e.g. when run from the demo ``__main__`` below.)
        ratio = (param or {}).get("trainDataRatio", train_size)
        self.train_data_ratio = float(ratio) if ratio is not None else 0.7

    def getModelData(self):
        """
        Build the modelling data: query, summarise, preprocess and split.

        :return: ([Xtrain, Ytrain], [Xtest, Ytest])
        """
        data_query_sql = "select {} from {}".format(",".join(self.all_col),
                                                    self.table_name)
        data = pd.DataFrame(data=self.data_source.queryAll(data_query_sql),
                            columns=self.all_col)
        # DataFrame.info writes to a buffer instead of returning a string,
        # so capture it through an in-memory StringIO.
        buf = io.StringIO()
        data.info(buf=buf)
        self.info = buf.getvalue()
        # Descriptive statistics.
        self.describe = data.describe()
        # Apply the configured preprocessing strategies in place.
        self._applyPreProcess(data)
        X = data.iloc[:, data.columns != self.class_col[0]]
        Y = data.iloc[:, data.columns == self.class_col[0]]
        # Optional feature scaling strategy. (Bug fix: use .get — the key is
        # absent from some frontend payloads and indexing raised KeyError.)
        standardization = (self.param or {}).get("standardization")
        if standardization == "MinMaxScaler":
            from sklearn.preprocessing import MinMaxScaler
            X = MinMaxScaler().fit_transform(X)
        elif standardization == "StandardScaler":
            from sklearn.preprocessing import StandardScaler
            X = StandardScaler().fit_transform(X)
        Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, train_size=self.train_data_ratio)
        return [Xtrain, Ytrain], [Xtest, Ytest]

    def _applyPreProcess(self, data):
        """
        Apply the preprocessing strategies from ``param`` to *data* in place.

        Strategies: deletena (drop column), fillna (impute), transClassFeature
        (categorical -> index), transType (cast dtype).
        """
        if self.param is None:
            return
        # Drop entries that are null placeholders or have no target feature.
        # (Bug fix: the original also evaluated preProcessMethodList[0] with
        # no guard, which raised IndexError on an empty list.)
        valid_methods = []
        for method in self.param.get("preProcessMethodList") or []:
            if method is None or method == "null":
                continue
            if StringUtils.isBlack(method.get("preProcessFeature")):
                continue
            valid_methods.append(method)
        self.param["preProcessMethodList"] = valid_methods
        for method in valid_methods:
            feature = method.get("preProcessFeature")
            strategy = method.get("preProcessMethod")
            strategy_value = method.get("preProcessMethodValue")
            if strategy == "deletena":
                # 1. Drop the whole column.
                data.drop(feature, inplace=True, axis=1)
            elif strategy == "fillna":
                # 2. Impute missing values with the selected strategy.
                imputer = self._buildImputer(strategy_value)
                if imputer is not None:
                    data[feature] = imputer.fit_transform(data[feature].values.reshape(-1, 1))
            elif strategy == "transClassFeature":
                # 3. Map categorical values to their index in the unique list.
                unique_value = data[feature].unique().tolist()
                data[feature] = data[feature].apply(lambda x: unique_value.index(x))
            elif strategy == "transType":
                # 4. Cast the column type.
                if strategy_value in ("int", "float"):
                    data[feature] = data[feature].astype(strategy_value)

    @staticmethod
    def _buildImputer(strategy_value):
        """Return the SimpleImputer for *strategy_value*, or None if unknown."""
        if strategy_value == "mean":
            return SimpleImputer()
        if strategy_value == "median":
            return SimpleImputer(strategy="median")
        if strategy_value == "most_frequent":
            return SimpleImputer(strategy="most_frequent")
        if strategy_value == "constant_0":
            return SimpleImputer(strategy="constant", fill_value=0)
        if strategy_value == "constant_1":
            return SimpleImputer(strategy="constant", fill_value=1)
        return None
if __name__ == '__main__':
    # Minimal parameter dict so the demo runs without the web frontend.
    # (Bug fix: the original passed no ``param`` at all, and __init__ then
    # crashed on param["trainDataRatio"].)
    demo_param = {
        "trainDataRatio": "0.7",
        "preProcessMethodList": [{"preProcessFeature": ""}],
        "standardization": "",
    }
    cluster = Classifier(app_name="cluster_demo",
                         data_source_id=9,
                         table_name="titanic",
                         feature_cols=["Survived", "Pclass", "Sex", "Age", "Cabin"],
                         class_col=["Embarked"],
                         train_size=0.7,
                         param=demo_param)
    cluster.getModelData()
决策树算法实现(decision_tree.py:继承上述基类,实现决策树的训练、评估与超参数学习曲线)
# _*_ coding: utf-8 _*_
# @Date : 2023/3/14 18:47
# @Author : Paul
# @File : decision_tree.py
# @Description : 决策树算法
import matplotlib.pyplot as plt
from core.beans.param_train_result import ParamTrainResult
from core.utils.log_util import LogUtil
from core.beans.classifier_result import ClassifierResult
from core.utils.date_util import DateUtil
from sklearn.tree import DecisionTreeClassifier
from classifiers.classifier import Classifier
import sys
import json
from core.utils.string_utils import StringUtils
class TWDecisionTree(Classifier):
    """
    Decision tree classifier built on the :class:`Classifier` base.

    Provides model construction, training, evaluation (with a rendered tree
    graph) and a hyper-parameter learning curve for ``max_depth``.
    """

    def __init__(self,
                 app_name="clusters",
                 data_source_id=None,
                 table_name=None,
                 feature_cols=None,
                 class_col=None,
                 train_size=None,
                 param=None):
        """
        Initialise the decision tree algorithm.

        :param app_name: application name, used in generated file names
        :param data_source_id: id of the data source that holds the table
        :param table_name: name of the training data table
        :param feature_cols: list of feature column names
        :param class_col: one-element list with the label column name
        :param train_size: fallback train split ratio
        :param param: algorithm parameter dict coming from the frontend
        """
        super(TWDecisionTree, self).__init__(app_name=app_name,
                                             data_source_id=data_source_id,
                                             table_name=table_name,
                                             feature_cols=feature_cols,
                                             class_col=class_col,
                                             train_size=train_size,
                                             param=param)
        # True: this model requires an evaluation step after training.
        # (Fix: the original comment claimed the opposite of the value.)
        self.IS_MODEL_EVAL = True
        # Base name (no extension) of the rendered tree image; graphviz adds ".pdf".
        self.tree_pred_image = "descion_tree_pred_" + app_name + "_" + DateUtil.getCurrentDateSimple()

    @staticmethod
    def _paramValue(params, key, default, cast=None):
        """Return params[key] (optionally cast), or *default* when missing/blank."""
        value = params.get(key)
        # Explicit None check keeps this safe even if isBlack cannot take None.
        if value is None or StringUtils.isBlack(value):
            return default
        return cast(value) if cast is not None else value

    def initModel(self):
        """Build the DecisionTreeClassifier from ``param["algoParam"]``."""
        algo_param = self.param["algoParam"]
        self.clf = DecisionTreeClassifier(
            criterion=self._paramValue(algo_param, "criterion", "gini"),
            random_state=self._paramValue(algo_param, "randomState", None, int),
            splitter=self._paramValue(algo_param, "splitter", "best"),
            max_depth=self._paramValue(algo_param, "maxDepth", None, int),
            min_samples_split=self._paramValue(algo_param, "minSamplesSplit", 2, int),
            min_samples_leaf=self._paramValue(algo_param, "minSamplesLeaf", 1, int))

    def buildModel(self, train_data):
        """
        Train the model.

        :param train_data: [Xtrain, Ytrain] as produced by getModelData
        """
        Xtrain, Ytrain = train_data[0], train_data[1]
        self.clf = self.clf.fit(Xtrain, Ytrain)

    def evalModel(self, train_data, test_data):
        """
        Evaluate the model on the test set, render the tree and persist results.

        :param train_data: [Xtrain, Ytrain] (unused, kept for interface compat)
        :param test_data: [Xtest, Ytest]
        """
        Xtest, Ytest = test_data[0], test_data[1]
        score_ = self.clf.score(Xtest, Ytest)
        var_importance = [*zip(self.feature_cols, self.clf.feature_importances_)]
        # Local imports: graphviz is only needed for the evaluation rendering.
        import graphviz
        from sklearn import tree
        dot_data = tree.export_graphviz(self.clf,
                                        feature_names=self.feature_cols,
                                        class_names=self.labels,
                                        filled=True,
                                        rounded=True)
        graph = graphviz.Source(dot_data)
        graph.view(filename=self.tree_pred_image, directory=self.image_path)
        # Run end time and duration.
        end_time = DateUtil.getCurrentDate()
        cost_second = DateUtil.diffMin(self.start_time, end_time)
        # Persist the model result to MySQL.
        # (Bug fix: status was the misspelled "sucess", inconsistent with the
        # "success" written by paramTrain and breaking status filtering.)
        algo_result = ClassifierResult(self.param["id"],
                                       "decision_tree",
                                       self.param,
                                       self.app_name,
                                       self.info,
                                       self.describe,
                                       self.image_path + self.tree_pred_image + ".pdf",
                                       var_importance,
                                       score_,
                                       "success",
                                       self.start_time,
                                       end_time,
                                       cost_second)
        LogUtil.saveClassifierResult(self.meta_data_source, algo_result)

    def paramTrain(self):
        """
        Hyper-parameter training: plot accuracy versus ``max_depth``.

        Reads the range from ``param["paramTrain"]`` and stores either the
        learning-curve image (success) or an error message (failure) in MySQL.
        """
        param_train = self.param["paramTrain"]
        param_name = self._paramValue(param_train, "paramName", None, str)
        param_start_value = self._paramValue(param_train, "paramStartValue", None, int)
        param_end_value = self._paramValue(param_train, "paramEndValue", None, int)
        param_range_value = self._paramValue(param_train, "paramRangeValue", None, int)
        # Fetch train/test data once for every candidate depth.
        train_data, test_data = self.getModelData()
        Xtrain, Ytrain = train_data[0], train_data[1]
        Xtest, Ytest = test_data[0], test_data[1]
        eval_value_list = []
        if (param_name is None or param_start_value is None or param_end_value is None
                or param_range_value is None or param_start_value == 1):
            error_info = "请确认参数必须为整数,且参数起始值不能为1"
            end_time = DateUtil.getCurrentDate()
            cost_second = DateUtil.diffMin(self.start_time, end_time)
            # Persist the failure to MySQL.
            param_train_result = ParamTrainResult(self.param["id"],
                                                  "decision_tree",
                                                  self.param,
                                                  self.app_name,
                                                  error_info,
                                                  "failed",
                                                  self.start_time,
                                                  end_time,
                                                  cost_second)
            LogUtil.saveParamTrainResult(self.meta_data_source, param_train_result)
        elif param_name == "max_depth":
            # Hoisted: the same range drives both training and the x-axis.
            depth_range = range(param_start_value, param_end_value, param_range_value)
            for max_depth in depth_range:
                self.clf = DecisionTreeClassifier(max_depth=max_depth)
                self.clf = self.clf.fit(Xtrain, Ytrain)
                eval_value_list.append(self.clf.score(Xtest, Ytest))
            # Save the learning curve plot.
            param_train_image = self.image_path + "descion_tree_param_train_" + DateUtil.getCurrentDateSimple() + ".png"
            fig, ax = plt.subplots(1, 1)
            ax.set_title("最大数据深度--准确率--超参数学习曲线")
            ax.plot(list(depth_range), eval_value_list)
            plt.savefig(param_train_image, dpi=300)
            end_time = DateUtil.getCurrentDate()
            cost_second = DateUtil.diffMin(self.start_time, end_time)
            # Persist the success to MySQL.
            param_train_result = ParamTrainResult(self.param["id"],
                                                  "decision_tree",
                                                  self.param,
                                                  self.app_name,
                                                  param_train_image,
                                                  "success",
                                                  self.start_time,
                                                  end_time,
                                                  cost_second)
            LogUtil.saveParamTrainResult(self.meta_data_source, param_train_result)
        # NOTE(review): any other param_name falls through silently with no
        # result row — presumably intentional (only max_depth is supported).
if __name__ == '__main__':
    # Expected payload (example):
    # argv = "{\"algoParam\":{\"criterion\":\"gini\",\"maxDepth\":\"\",\"minSamplesLeaf\":\"\",\"minSamplesSplit\":\"\",\"randomState\":\"\",\"splitter\":\"best\"},\"appName\":\"kmeans_1\",\"classCols\":\"Survived\",\"dataSourceId\":\"9\",\"featureCols\":\"Pclass,Age,Sex,SibSp,Parch,Fare,Embarked\",\"id\":\"1679300369558\",\"preProcessMethodList\":[{\"preProcessMethod\":\"entropy\"}],\"tableName\":\"titanic\"}"
    argv = sys.argv[1]
    param = json.loads(argv)
    app_name = param["appName"]
    data_source_id = param["dataSourceId"]
    table_name = param["tableName"]
    # featureCols arrives as a comma-separated string (see the example above)
    # but Classifier needs a list. (Bug fix: passing the raw string made
    # feature_cols + class_col raise TypeError.)
    feature_cols = param["featureCols"]
    if isinstance(feature_cols, str):
        feature_cols = [col for col in feature_cols.split(",") if col]
    class_cols = param["classCols"]
    # Bug fix: trainDataRatio is absent from some payloads; default to 0.7
    # instead of raising KeyError.
    train_size = float(param.get("trainDataRatio", 0.7))
    class_cols_list = class_cols if isinstance(class_cols, list) else [class_cols]
    # Bug fix: pass the computed train_size instead of the hard-coded 0.7,
    # keeping it consistent with trainDataRatio.
    classifier = TWDecisionTree(app_name=app_name,
                                data_source_id=data_source_id,
                                table_name=table_name,
                                feature_cols=feature_cols,
                                class_col=class_cols_list,
                                train_size=train_size,
                                param=param)
    # A payload with "paramTrain" requests a learning curve; otherwise run
    # the normal train/evaluate pipeline.
    if "paramTrain" not in param:
        classifier.execute()
    else:
        classifier.paramTrain()
详细代码见 Gitee 仓库:
twinkle_algo_plat:晓烁算法平台(算法端)