前言:接着之前数据预处理完之后,今天主要实现了对模型的初选与评估
实现步骤
- 模块导入
- 数据预处理:主要指对数据的归一化处理
- 采用cross_val_score()对所选择的模型进行评估
- 模型训练结果可视化
- 模型预测结果可视化
遇到的主要问题及解决方法
- 交叉验证评估与交叉验证选择超参数的区别:交叉验证评估与调参
- 程序参考:《python数据分析与数据运营》
- python绘图属性设置:绘图属性统一调整方法
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 4 10:49:07 2020

Model pre-selection and evaluation for regression: cross-validate several
candidate regressors on preprocessed data, then visualize fit and
prediction quality.

@author: 85845
"""
import numpy as np
import dataprocess as dp  # project-local data-preprocessing module
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import BayesianRidge, LinearRegression, ElasticNet
from sklearn.svm import SVR
# Public import path; `sklearn.ensemble.gradient_boosting` is a private
# module that was deprecated and removed in scikit-learn 0.24.
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (explained_variance_score, mean_absolute_error,
                             mean_squared_error, r2_score)
# Metric overview: https://www.cnblogs.com/mdevelopment/p/9456486.html
# EV : explained-variance score, [0, 1]; closer to 1 means the features
#      explain more of the target's variance.
# MAE: mean absolute error — how close predictions are to the true values
#      (lower is better).
# MSE: mean squared error — mean of the squared residuals (lower is better).
# R2 : coefficient of determination, [0, 1]; closer to 1 is better.
# =============================================================================
# Load the preprocessed, shuffled train/test split ('std' mode — presumably
# standardized/normalized features; confirm against dataprocess.shuffledata).
X_train, X_test, y_train, y_test, index_train, index_test = dp.shuffledata('std')
n_folds = 6  # number of cross-validation folds
# Candidate regressors, all with default hyperparameters (tuning comes later).
model_br = BayesianRidge()               # Bayesian ridge regression
model_lr = LinearRegression()            # ordinary least-squares regression
model_etc = ElasticNet()                 # elastic-net regression
model_svr = SVR()                        # support vector regression
model_gbr = GradientBoostingRegressor()  # gradient boosting regression
model_xgb = xgb.XGBRegressor(objective='reg:squarederror')  # XGBoost regression
model_names = ['BayesianRidge', 'LinearRegression', 'ElasticNet',
               'SVR', 'GBR', 'XGBR']
model_dir = [model_br, model_lr, model_etc, model_svr, model_gbr, model_xgb]
# =============================================================================
# Cross-validation scoring and model training
cv_score_list = []  # per-model cross-validation scores (one array per model)
y_train_pre = []    # per-model predictions on the training set
y_test_pre = []     # per-model predictions on the test set
for model in model_dir:
    # Cross-validate each regressor on the training data (R^2 per fold).
    scores = cross_val_score(model, X_train, y_train, cv=n_folds, scoring='r2')
    cv_score_list.append(scores)
    # Fit once, then predict on both splits. The original code refit the
    # model for each predict() call, doubling training cost for identical
    # results.
    model.fit(X_train, y_train)
    y_train_pre.append(model.predict(X_train))
    y_test_pre.append(model.predict(X_test))
# =============================================================================
# Evaluate fit (train) and prediction (test) quality for every model
n_samples, n_features = X_train.shape  # training sample and feature counts
n_samples_test = X_test.shape[0]       # test sample count
model_metrics_name = [explained_variance_score, mean_absolute_error,
                      mean_squared_error, r2_score]
model_train_metrics = []  # per-model metric rows on the training set
model_test_metrics = []   # per-model metric rows on the test set
for i in range(len(model_dir)):
    tmp_list = []
    tmp2_list = []
    for m in model_metrics_name:
        tmp_list.append(m(y_train, y_train_pre[i]))  # train-set metric
        tmp2_list.append(m(y_test, y_test_pre[i]))   # test-set metric
    model_train_metrics.append(tmp_list)
    model_test_metrics.append(tmp2_list)
# One row per model: CV score per fold, train metrics, test metrics.
df1 = pd.DataFrame(cv_score_list, index=model_names)
df2 = pd.DataFrame(model_train_metrics, index=model_names,
                   columns=['EV', 'MAE', 'MSE', 'R2'])
df3 = pd.DataFrame(model_test_metrics, index=model_names,
                   columns=['EV', 'MAE', 'MSE', 'R2'])
print('samples: %d \t features: %d' % (n_samples, n_features))
print(70 * '-')  # separator line
print('Cross validation result:')
print(df1)
print(70 * '-')
print('Regression train metrics:')
print(df2)
print(70 * '-')
print('Regression test metrics:')
print(df3)
print(70 * '-')
print('short name \t full name')
print('EV \t explained_variance')
print('MAE \t mean_absolute_error')
print('MSE \t mean_squared_error')
print('R2 \t R2')
print(70 * '-')
# =============================================================================
# =============================================================================
# Visualize how each model fits the training data
plt.figure(num='Train', figsize=(16, 12), edgecolor='k', frameon=True)
plot_list = {'markersize': 8, 'linewidth': 2}  # shared marker size / line width
linestyle_list = ['r-+', 'g-o', 'b-*', 'y-^', 'c-v', 'm-x']  # per-model styles
plt.subplot(211)
# Ground-truth training targets in black.
plt.plot(np.arange(n_samples), y_train, color='k', label='True y', **plot_list)
for i, pre_y in enumerate(y_train_pre):
    plt.plot(np.arange(n_samples), pre_y, linestyle_list[i],
             label=model_names[i], **plot_list)
# Fixed font-family typo: 'Time New Roman' -> 'Times New Roman' (the typo
# made matplotlib silently fall back to its default font).
font1 = {'family': 'Times New Roman', 'weight': 'normal', 'size': 24}
font2 = {'family': 'Times New Roman', 'weight': 'normal', 'size': 12}
plt.title('Regression result comparison', font1)  # fixed 'comparsion' typo
plt.legend(loc='upper right', prop=font2)
plt.xlabel('Number', font1)
plt.ylabel('Real and prediction values (μm)', font1)
plt.tick_params(labelsize=23)  # axis tick-label size
plt.grid()
# =============================================================================
# Per-sample absolute deviation |prediction - truth| on the training set
absolute_deviation_train = []  # one deviation array per model
plt.subplot(212)
for i, pre_y in enumerate(y_train_pre):
    absolute_deviation_train.append(abs(pre_y - y_train))
    plt.plot(np.arange(n_samples), absolute_deviation_train[i],
             linestyle_list[i], label=model_names[i], **plot_list)
# Fixed font-family typo: 'Time New Roman' -> 'Times New Roman'.
font1 = {'family': 'Times New Roman', 'weight': 'normal', 'size': 24}
font2 = {'family': 'Times New Roman', 'weight': 'normal', 'size': 12}
plt.legend(loc='upper right', prop=font2)
plt.xlabel('Number', font1)
plt.ylabel('Absolute deviation (μm)', font1)
plt.tick_params(labelsize=23)
plt.grid()
# =============================================================================
# Visualize prediction quality on the held-out test set
plt.figure(num='Prediction', figsize=(16, 12), edgecolor='k', frameon=True)
plt.subplot(211)
# Ground-truth test targets in black.
plt.plot(np.arange(n_samples_test), y_test, color='k', label='True y',
         **plot_list)
for i, pre_y in enumerate(y_test_pre):
    plt.plot(np.arange(n_samples_test), pre_y, linestyle_list[i],
             label=model_names[i], **plot_list)
# Fixed font-family typo: 'Time New Roman' -> 'Times New Roman'.
font1 = {'family': 'Times New Roman', 'weight': 'normal', 'size': 24}
font2 = {'family': 'Times New Roman', 'weight': 'normal', 'size': 12}
plt.title('Regression result comparison', font1)  # fixed 'comparsion' typo
plt.legend(loc='upper right', prop=font2)
plt.xlabel('Number', font1)
plt.ylabel('Real and prediction values (μm)', font1)
plt.tick_params(labelsize=23)
plt.grid()
# Per-sample absolute deviation |prediction - truth| on the test set.
absolute_deviation_test = []
plt.subplot(212)
for i, pre_y in enumerate(y_test_pre):
    absolute_deviation_test.append(abs(pre_y - y_test))
    plt.plot(np.arange(n_samples_test), absolute_deviation_test[i],
             linestyle_list[i], label=model_names[i], **plot_list)
font1 = {'family': 'Times New Roman', 'weight': 'normal', 'size': 24}
font2 = {'family': 'Times New Roman', 'weight': 'normal', 'size': 12}
plt.legend(loc='upper right', prop=font2)
plt.xlabel('Number', font1)
plt.ylabel('Absolute deviation (μm)', font1)
plt.tick_params(labelsize=23)
plt.grid()   # added: every sibling subplot draws a grid; this one was missing it
plt.show()   # render both figures when run as a plain script
运行结果
samples: 122 features: 3
----------------------------------------------------------------------
Cross validation result:
0 1 2 3 4 5
BayesianRidge 0.711268 0.599770 0.745132 0.712472 0.628867 0.755151
LinearRegression 0.710164 0.592171 0.745570 0.712059 0.632022 0.756237
ElasticNet 0.145011 -0.064107 0.153849 0.105554 0.024965 0.146332
SVR 0.652658 0.631152 0.655725 0.564803 0.449425 0.676986
GBR 0.761767 0.368675 0.502797 0.783080 0.724101 0.615538
XGBR 0.754678 0.334815 0.530806 0.772734 0.809617 0.614672
----------------------------------------------------------------------
Regression train metrics:
EV MAE MSE R2
BayesianRidge 0.717568 2.326539 8.616873 0.717568
LinearRegression 0.717639 2.333652 8.614706 0.717639
ElasticNet 0.139904 3.963014 26.241160 0.139904
SVR 0.680049 2.118396 10.329849 0.661423
GBR 0.927503 1.178495 2.211848 0.927503
XGBR 0.928818 1.159955 2.171747 0.928817
----------------------------------------------------------------------
Regression test metrics:
EV MAE MSE R2
BayesianRidge 0.675782 2.419355 9.621036 0.673262
LinearRegression 0.673665 2.435601 9.688636 0.670967
ElasticNet 0.142080 3.775813 25.602307 0.130527
SVR 0.631979 2.478024 11.228063 0.618687
GBR 0.626700 2.382516 11.032982 0.625312
XGBR 0.667738 2.305168 9.859156 0.665176
----------------------------------------------------------------------
short name full name
EV explained_variance
MAE mean_absolute_error
MSE mean_squared_error
R2 R2
----------------------------------------------------------------------
结果分析
根据运行结果不难发现,BayesianRidge(贝叶斯线性回归)、GBR(梯度提升回归)、XGBR(XGBoost)三个模型无论从交叉验证评分(分数均较高且稳定)上看,还是从预测效果评分(线性回归评估的四个指标)上看,效果都是最好的。
需要注意的是:未对模型进行调参,因此预测精度还有待进一步提升。