使用sklearn回归预测结果与测试集拟合曲线图绘制
以上文用过的随机森林模型为例,在函数中需要增加的有三项
1.使用测试集的X特征值算出你要预测的值,存到y_predict中(这个变量取什么名字随便,只要和后面返回时用的名字一致即可)
y_predict = model.predict(X_test)
2.在返回中加入两项
"predict": y_predict,  # 为刚才的预测值
"y_test": y_test  # 为测试集中想要预测的特征的真实值
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from application import logger
from application.utils import ModelScoreUtil
def train(X_data, y_data):
    """Grid-search a random-forest regressor and evaluate it on a hold-out split.

    The data is split 80/20 with a fixed ``random_state`` of 42, a
    ``GridSearchCV`` over ``RandomForestRegressor`` is fitted on the training
    part, and the fitted search object then predicts the test part so the
    caller can plot predictions against the true values.

    :param X_data: feature table handed to ``train_test_split``.
    :param y_data: target values aligned with ``X_data``.
    :return: dict with the keys
        ``"model"`` (the fitted ``GridSearchCV`` object),
        ``"best_param"`` (``model.best_params_``),
        ``"model_score_check_result"`` (whatever ``ModelScoreUtil.check`` returns),
        ``"predict"`` (predictions for the test features) and
        ``"y_test"`` (true target values of the test split).
    """
    # Split the data: training set / test set.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=42
    )

    # Create the model and search the hyper-parameter grid.
    model = GridSearchCV(
        estimator=RandomForestRegressor(),
        param_grid={
            'max_depth': range(20, 40, 5),
            'n_estimators': range(200, 600, 100),
        },
        verbose=4,
        n_jobs=6,
    )

    logger.info("训练模型:开始")
    model.fit(X_train, y_train)
    logger.info("训练模型:完成")

    # Predict the target for the held-out features; returned for plotting.
    y_predict = model.predict(X_test)

    # Score the fitted model on the test split and log the result as a table.
    model_score_check_result = ModelScoreUtil.check(model=model, X_test=X_test, y_test=y_test)
    logger.info(f"模型评分结果: \n {pd.DataFrame([model_score_check_result])}")

    # Return the model together with the data needed for the fit plot.
    return {
        "model": model,
        "best_param": model.best_params_,
        "model_score_check_result": model_score_check_result,
        "predict": y_predict,
        "y_test": y_test,
    }
if __name__ == "__main__":
    # Nothing happens when this module is executed directly.
    pass
再看主函数中怎么调用这两项predict和y_test画图
画图部分的代码如下
# Plot the predicted values against the true test-set values.
# `predict` and `y_test` are expected to be the "predict" / "y_test" entries
# of the dict returned by train().
# figsize is (width, height) in inches; dpi is not passed here, so
# matplotlib's default resolution applies.
fig = plt.figure(figsize=(200, 3))
# Single subplot (1 row, 1 column, index 1) that both curves are drawn on.
axes = fig.add_subplot(1, 1, 1)
# line1: predictions, blue dashed line; x is the sample position 0..len(predict)-1.
line1, = axes.plot(range(len(predict)), predict, 'b--', label='predict', linewidth=2)
# line3: true values, green solid line, drawn over the same x positions.
line3, = axes.plot(range(len(predict)), y_test, 'g', label='true')
axes.grid()
fig.tight_layout()
# The legend is built from the two line handles and their labels.
plt.legend(handles=[line1, line3])
plt.title('拟合曲线')
plt.show()
#
pass
.plot(x, y, ls="-", lw=2, label="plot figure")
x: x轴上的数值
y: y轴上的数值
ls:折线图的线条风格
lw:折线图的线条宽度
label:标记图内容的标签文本
如对曲线line1
x:range(len(predict)) #为预测值从0开始依次编的序号
y:predict #为预测值
颜色和线型:'b--'(蓝色虚线)
label:predict
linewidth:2
line1为特征值预测值的曲线,line3为实际值的曲线
line1, = axes.plot(range(len(predict)), predict, 'b--', label='predict', linewidth=2)
line3, = axes.plot(range(len(predict)), y_test, 'g', label='true')
plt.legend(handles=[line1, line3])
# encoding=utf-8
import datetime
import pickle
from application import logger, model_algorithm
from application.data_source.ds_model_station_supply_water_temper import query_model_station_supply_water_temper
from application.model_algorithm.outlier.iforest import isolation_forest
from application.utils import ModelStorePathUtil, MySQLUtils
import matplotlib.pyplot as plt
def create(model_name, station_id, start_time, end_time):
    """
    Create the model: secondary supply-water temperature.

    Steps performed, in order: query the historical data, keep only the
    modelling columns, drop the rows flagged by ``isolation_forest``, train
    through ``model_algorithm.huak_rf.train``, pickle the fitted model to two
    storage paths, mark the model as created in ``sn_control_hotstation_model``
    and finally plot the predictions against the test-set values.

    :param model_name: model name; converted with ``str()`` and used to resolve the storage paths.
    :param station_id: station id used for the data query and in the SQL update.
    :param start_time: start of the history window passed to the data query.
    :param end_time: end of the history window passed to the data query.
    :return: ``None`` after a full run; when the query yields no rows the
        function logs that fact and returns early without training.
    """
    # Feature columns and target column. Humidity ("outside_humidity") and
    # "water_deviation_temper" are intentionally not part of the model.
    feature_columns = [
        "pre_time",
        "outside_temper",
        "outside_weather",
        "outside_wind",
        "outside_wind_speed",
        "average_inside_temper",
    ]
    target_column = "supply_water_temper"

    # Query the data.
    data = query_model_station_supply_water_temper(station_id=station_id, start_time=start_time, end_time=end_time)
    logger.info(f"二次供温历史数据:\n {data}")
    logger.info(f"二次供温历史数据:\n {data.columns}")

    # Filter the data: keep only the columns used for modelling.
    data = data[feature_columns + [target_column]]

    # Empty data set: log and stop without training.
    if len(data) == 0:
        return logger.info(f"没有可用的数据集:{model_name}, {station_id}, {start_time}, {end_time}")

    # Outliers: locate them, then drop them and renumber the rows.
    outlier_index, _ = isolation_forest(data=data)
    data = data.drop(index=outlier_index)
    data = data.reset_index(drop=True)

    X_data, y_data = data[feature_columns], data[target_column]

    # ==========================================================================
    # Start training.
    # NOTE(review): the original comments and variable names said "lightGBM",
    # but the trainer actually called is model_algorithm.huak_rf.train.
    # ==========================================================================
    train_result: dict = model_algorithm.huak_rf.train(
        X_data=X_data, y_data=y_data
    )
    # Training result.
    logger.info(f"训练结果: \n {train_result}")

    model = train_result.get("model")
    predict = train_result.get("predict")
    y_test = train_result.get("y_test")

    # Model name.
    model_name = str(model_name)
    logger.info(f"模型名称: {model_name}")
    # Model storage path (by model name).
    model_store_path_01 = ModelStorePathUtil.get_model_path_by_model_name_type2(model_name=model_name)
    logger.info(f"模型存储路径[model_store_path_01]: {model_store_path_01}")
    # Model storage path (by model name plus the start/end time window).
    model_store_path_02 = ModelStorePathUtil.get_model_path_by_model_name_type2_with_start_end_time(
        model_name=model_name, start_time=start_time, end_time=end_time
    )
    logger.info(f"模型存储路径[model_store_path_02]: {model_store_path_02}")

    # Save the model to both paths; `with` guarantees the files are flushed
    # and closed even if pickling fails.
    for store_path in (model_store_path_01, model_store_path_02):
        with open(store_path, "wb") as model_file:
            pickle.dump(model, model_file)

    updated_at = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # NOTE(review): station_id is interpolated directly into the SQL text.
    # If it can ever come from untrusted input, switch to a parameterized
    # query once the MySQLUtils.execute API is confirmed to support one.
    sql = f"UPDATE sn_control_hotstation_model " \
          f"SET is_created=1 ,last_model_update_time= '{updated_at}' " \
          f"WHERE model_type=2 AND station_id ='{station_id}' "

    # Update statement.
    logger.info(f"更新语句: {sql}")
    # Execute the update.
    result = MySQLUtils.execute(query=sql)
    # Update result.
    logger.info(f"更新结果: {result}")

    logger.info(f"模型存储完毕: {model_name}")

    # Plot the predictions against the test-set values.
    logger.info(f"绘制{model_name}结果拟合图")
    # figsize is (width, height) in inches; dpi is left at matplotlib's default.
    fig = plt.figure(figsize=(200, 3))
    # Single subplot that both curves are drawn on.
    axes = fig.add_subplot(1, 1, 1)
    # line1: predictions (blue dashed); line3: true values (green solid).
    line1, = axes.plot(range(len(predict)), predict, 'b--', label='predict', linewidth=2)
    line3, = axes.plot(range(len(predict)), y_test, 'g', label='true')
    axes.grid()
    fig.tight_layout()
    plt.legend(handles=[line1, line3])
    plt.title('拟合曲线')
    plt.show()
if __name__ == "__main__":
    # Manual run: build the model for one station over a one-month window.
    create(
        model_name="政泉花园二期北区-低区",
        station_id="1552",
        start_time="2020-11-01 00:00:00",
        end_time="2020-12-01 00:00:00",
    )
拟合曲线如下