Base class implementation
# _*_ coding: utf-8 _*_
# @Date : 2023/3/20 23:43
# @Author : Paul
# @File : regression.py
# @Description :
import pandas as pd
import io
import matplotlib.pyplot as plt
from core.utils.string_utils import StringUtils
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from core.algo.base_algo import BaseAlgo
from core.data_source.meta_data_source.meta_data_source import MetaDataSource
from core.utils.data_souce_init_utils import DataSourceInitUtil
from core.utils.date_util import DateUtil
class Regression(BaseAlgo):
    """Base class for regression jobs: loads the training table from the
    configured data source, applies the preprocessing strategy described in
    ``param`` and produces train/test splits for subclasses to model."""

    def __init__(self,
                 app_name="clusters",
                 data_source_id=None,
                 table_name=None,
                 feature_cols=None,
                 class_col=None,
                 train_size=None,
                 param=None
                 ):
        """
        Initialize the regression base class.

        :param app_name: job name, used in artifact file names
        :param data_source_id: id of the data source holding the training table
        :param table_name: table to read the model data from
        :param feature_cols: list of feature column names
        :param class_col: list holding the (single) target column name
        :param train_size: legacy split ratio; the split actually uses
            ``param["trainDataRatio"]``
        :param param: job parameter dict (preprocessing, scaling, ratio, ...);
            required — ``param["trainDataRatio"]`` is read below
        """
        super(Regression, self).__init__(app_name=app_name)
        self.param = param
        # Job start time (subclasses use it for cost accounting).
        self.start_time = DateUtil.getCurrentDate()
        self.table_name = table_name
        self.feature_cols = feature_cols
        self.class_col = class_col
        self.train_size = train_size
        # All columns to query: features followed by the target column.
        self.all_col = self.feature_cols + self.class_col
        # Fitted model instance (set by subclasses).
        self.reg = None
        # Concise data summary (captured output of DataFrame.info).
        self.info = None
        # Descriptive statistics (DataFrame.describe result).
        self.describe = None
        # Path of the predicted-vs-actual plot image.
        self.regress_pred_image = (self.image_path + "regress_pred_" + app_name
                                   + "_" + DateUtil.getCurrentDateSimple() + ".png")
        # Metadata database (run results are persisted there).
        self.meta_data_source = MetaDataSource()
        # Data source that holds the training table.
        self.data_source = DataSourceInitUtil.getDataBase(self.meta_data_source,
                                                          data_source_id)
        self.labels = None
        # Fraction of rows used for training.
        self.train_data_ratio = float(self.param["trainDataRatio"])

    def getModelData(self):
        """
        Build the modeling data: query the source table, preprocess it
        according to ``param`` and split into train and test sets.

        :return: ([Xtrain, Ytrain], [Xtest, Ytest])
        """
        data_query_sql = "select {} from {}".format(",".join(self.all_col),
                                                    self.table_name)
        data = self.data_source.queryAll(data_query_sql)
        data = pd.DataFrame(data=data, columns=self.all_col)
        # DataFrame.info only writes to a buffer, so capture it via StringIO.
        buf = io.StringIO()
        data.info(buf=buf)
        self.info = buf.getvalue()
        # Descriptive statistics.
        self.describe = data.describe()
        # Keep only preprocessing entries that actually name a feature column.
        # (The original indexed [0] of this list unconditionally, which crashed
        # on an empty list; that dead statement has been removed.)
        process_method_list = self.param.get("preProcessMethodList") or []
        process_method_list_after_process = []
        for process_method in process_method_list:
            if process_method is None or process_method == "null":
                continue
            pre_process_feature = process_method.get("preProcessFeature")
            if StringUtils.isBlack(pre_process_feature):
                continue
            process_method_list_after_process.append(process_method)
        self.param["preProcessMethodList"] = process_method_list_after_process
        for process_method in process_method_list_after_process:
            pre_process_feature = process_method.get("preProcessFeature")
            preProcessMethod = process_method.get("preProcessMethod")
            preProcessMethodValue = process_method.get("preProcessMethodValue")
            # 1. Drop the column.
            #    NOTE(review): "deletena" drops the whole column, it does NOT
            #    drop NA rows — confirm this is the intended semantics.
            if preProcessMethod == "deletena":
                data.drop(pre_process_feature, inplace=True, axis=1)
            # 2. Impute missing values with the configured strategy.
            elif preProcessMethod == "fillna":
                if preProcessMethodValue == "mean":
                    imputer = SimpleImputer()
                elif preProcessMethodValue == "median":
                    imputer = SimpleImputer(strategy="median")
                elif preProcessMethodValue == "most_frequent":
                    imputer = SimpleImputer(strategy="most_frequent")
                elif preProcessMethodValue == "constant_0":
                    imputer = SimpleImputer(strategy="constant", fill_value=0)
                elif preProcessMethodValue == "constant_1":
                    imputer = SimpleImputer(strategy="constant", fill_value=1)
                else:
                    imputer = None
                if imputer is not None:
                    data[pre_process_feature] = imputer.fit_transform(
                        data[pre_process_feature].values.reshape(-1, 1))
            # 3. Encode a categorical column as integer codes (first-seen order).
            elif preProcessMethod == "transClassFeature":
                unique_value = data[pre_process_feature].unique().tolist()
                data[pre_process_feature] = data[pre_process_feature].apply(
                    lambda x: unique_value.index(x))
            # 4. Cast the column dtype.
            elif preProcessMethod == "transType":
                if preProcessMethodValue in ("int", "float"):
                    data[pre_process_feature] = \
                        data[pre_process_feature].astype(preProcessMethodValue)
        X = data.iloc[:, data.columns != self.class_col[0]]
        Y = data.iloc[:, data.columns == self.class_col[0]]
        # Optional feature scaling; .get() because not every job sets the key
        # (the sample launcher JSON omits "standardization").
        standardization = self.param.get("standardization")
        if standardization == "MinMaxScaler":
            from sklearn.preprocessing import MinMaxScaler
            X = MinMaxScaler().fit_transform(X)
        elif standardization == "StandardScaler":
            from sklearn.preprocessing import StandardScaler
            X = StandardScaler().fit_transform(X)
        Xtrain, Xtest, Ytrain, Ytest = train_test_split(
            X, Y, train_size=self.train_data_ratio)
        return [Xtrain, Ytrain], [Xtest, Ytest]
if __name__ == '__main__':
    # Minimal smoke-test parameters. Regression.__init__ dereferences
    # param["trainDataRatio"] and getModelData reads the preprocessing and
    # scaling keys, so running with param=None (as before) raised TypeError.
    demo_param = {
        "trainDataRatio": "0.7",
        "preProcessMethodList": [],
        "standardization": "",
    }
    regression = Regression(app_name="cluster_demo",
                            data_source_id=9,
                            table_name="titanic",
                            feature_cols=["Survived", "Pclass", "Sex", "Age", "Cabin"],
                            class_col=["Embarked"],
                            train_size=0.7,
                            param=demo_param)
    regression.getModelData()
Linear regression implementation
# _*_ coding: utf-8 _*_
# @Date : 2023/3/22 17:25
# @Author : Paul
# @File : linear_regression.py
# @Description :
import matplotlib.pyplot as plt
import numpy as np
import sys
import json
from core.utils.log_util import LogUtil
from core.beans.regress_result import RegressionResult
from core.utils.string_utils import StringUtils
from core.utils.date_util import DateUtil
from sklearn.model_selection import cross_val_score
from regressions.regression import Regression
from sklearn.linear_model import LinearRegression as LR
class TWLinearRegression(Regression):
    """Linear regression job: builds a sklearn LinearRegression on data
    prepared by the Regression base class, evaluates it with RMSE and stores
    the result (metrics, coefficients, plot) in the metadata database."""

    def __init__(self,
                 app_name="linear_regression",
                 data_source_id=None,
                 table_name=None,
                 feature_cols=None,
                 class_col=None,
                 train_size=None,
                 param=None
                 ):
        """
        Initialize the linear-regression job.

        :param app_name: job name, used in artifact file names
        :param data_source_id: id of the data source holding the training table
        :param table_name: table to read the model data from
        :param feature_cols: list of feature column names
        :param class_col: list holding the (single) target column name
        :param train_size: train split ratio
        :param param: full job parameter dict
        """
        super(TWLinearRegression, self).__init__(app_name=app_name,
                                                 data_source_id=data_source_id,
                                                 table_name=table_name,
                                                 feature_cols=feature_cols,
                                                 class_col=class_col,
                                                 train_size=train_size,
                                                 param=param)
        # Whether to run model evaluation (enabled by default).
        self.IS_MODEL_EVAL = True
        # Base name for the prediction plot.
        # NOTE(review): evalModel saves to self.regress_pred_image (from the
        # base class); this attribute appears unused — confirm before removal.
        self.tree_pred_image = "descion_tree_pred_" + app_name + "_" + DateUtil.getCurrentDateSimple()
        # Maximum number of points drawn in the prediction plot.
        self.plot_rows_num = 200

    def initModel(self):
        """
        Initialize the sklearn LinearRegression estimator from
        ``param["algoParam"]``.
        """
        algoParam = self.param["algoParam"]

        def _flag(value, default):
            # Flags arrive as JSON strings ("true"/"false"). bool("false") is
            # True, so the value must be compared against the literal "true"
            # instead of being cast with bool().
            if StringUtils.isBlack(value):
                return default
            if isinstance(value, str):
                return value.strip().lower() == "true"
            return bool(value)

        fit_intercept = _flag(algoParam["fitIntercept"], True)
        normalize = _flag(algoParam["normalize"], False)
        copy_X = _flag(algoParam["copyX"], True)
        n_jobs = None if StringUtils.isBlack(algoParam["nJobs"]) else int(algoParam["nJobs"])
        # NOTE(review): the ``normalize`` argument was removed from sklearn's
        # LinearRegression in 1.2 — confirm the pinned sklearn version.
        self.reg = LR(fit_intercept=fit_intercept,
                      normalize=normalize,
                      copy_X=copy_X,
                      n_jobs=n_jobs)

    def buildModel(self, train_data):
        """
        Fit the model.

        :param train_data: [Xtrain, Ytrain] as produced by getModelData
        """
        Xtrain = train_data[0]
        Ytrain = train_data[1]
        self.reg = self.reg.fit(Xtrain, Ytrain)

    def evalModel(self, train_data, test_data):
        """
        Evaluate the model on the test split: compute RMSE, collect
        per-feature coefficients, plot predicted vs. actual values and
        persist the run result to the metadata database.

        :param train_data: [Xtrain, Ytrain] (unused here)
        :param test_data: [Xtest, Ytest]
        """
        Xtest = test_data[0]
        Ytest = test_data[1]
        Ytest_predict = self.reg.predict(Xtest)
        from sklearn.metrics import mean_squared_error as MSE
        # Root mean squared error on the test split.
        self.score_ = np.sqrt(MSE(Ytest, Ytest_predict))
        # Feature name -> fitted coefficient.
        var_importance = {}
        for i in range(len(self.feature_cols)):
            var_importance[self.feature_cols[i]] = self.reg.coef_[0][i]
        test_data_rows = len(Xtest)
        plt.figure()
        # Plot at most plot_rows_num points so the chart stays readable.
        if test_data_rows < self.plot_rows_num:
            plt.plot(np.linspace(0.05, 1, test_data_rows), Ytest, "green", label="Y-真实值")
            plt.plot(np.linspace(0.05, 1, test_data_rows), Ytest_predict, "red", label="Y-预测值")
        else:
            plt.plot(np.linspace(0.05, 1, self.plot_rows_num), Ytest[:self.plot_rows_num], "green", label="Y-真实值")
            plt.plot(np.linspace(0.05, 1, self.plot_rows_num), Ytest_predict[:self.plot_rows_num], "red", label="Y-预测值")
        plt.legend()
        plt.savefig(self.regress_pred_image, dpi=300)
        plt.show()
        # Job end time and elapsed cost.
        end_time = DateUtil.getCurrentDate()
        cost_second = DateUtil.diffMin(self.start_time, end_time)
        # Persist the run result to MySQL.
        # NOTE(review): "sucess" is misspelled, but downstream consumers may
        # match on this exact literal — fix in a coordinated change.
        algo_result = RegressionResult(self.param["id"],
                                       "linear_regression",
                                       self.param,
                                       self.app_name,
                                       self.info,
                                       self.describe,
                                       self.regress_pred_image,
                                       json.dumps(var_importance).replace("\"", "'"),
                                       self.score_,
                                       "sucess",
                                       self.start_time,
                                       end_time,
                                       cost_second)
        LogUtil.saveRegressionResult(self.meta_data_source, algo_result)
if __name__ == '__main__':
argv = sys.argv[1]
# argv = "{\"algoParam\":{\"copyX\":\"true\",\"fitIntercept\":\"true\",\"nJobs\":\"\",\"normalize\":\"true\"},\"appName\":\"kmeans_1\",\"classCols\":\"housePrice\",\"dataSourceId\":\"9\",\"featureCols\":\"MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude\",\"id\":\"1679540469249\",\"preProcessMethodList\":[{\"preProcessMethod\":\"deletena\"}],\"tableName\":\"california_housing\",\"trainDataRatio\":\"0.7\"}"
param = json.loads(argv)
app_name = param["appName"]
data_source_id = param["dataSourceId"]
table_name = param["tableName"]
feature_cols = param["featureCols"]
class_cols = param["classCols"]
train_size = float(param["trainDataRatio"])
class_cols_list = []
if isinstance(class_cols, list):
class_cols_list = class_cols
else:
class_cols_list.append(class_cols)
classifier = TWLinearRegression(app_name=app_name,
data_source_id=data_source_id,
table_name=table_name,
feature_cols=feature_cols,
class_col=class_cols_list,
train_size=train_size,
param=param)
if "paramTrain" not in param.keys():
classifier.execute()
else:
classifier.paramTrain()