案例-基于Gradient Boosting的自动超参数优化的销售预测

最新推荐文章于 2024-08-16 22:44:00 发布

小天资源

最新推荐文章于 2024-08-16 22:44:00 发布

阅读量862

点赞数 1

分类专栏：数据化运营 Python 数据分析

本文链接：https://blog.csdn.net/qq_42169061/article/details/104862025

版权

Python 同时被 3 个专栏收录

53 篇文章 13 订阅

订阅专栏

数据分析

44 篇文章 4 订阅

订阅专栏

数据化运营

19 篇文章 1 订阅

订阅专栏

# 导入库

import matplotlib.pyplot as plt  # 导入图形展示库
import numpy as np  # 导入numpy库
import pandas as pd  # 导入pandas库
from sklearn.ensemble import GradientBoostingRegressor  # 集成方法回归库
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import GridSearchCV  # 导入交叉检验库


# Load data

# The source file is comma-separated even though it carries a .txt extension.
raw_data = pd.read_table('products_sales.txt', sep=',')

# Data review

# Overview: the last two records, then the dtype of each column
overview_title = '{:*^60}'.format('Data overview:')
print(overview_title, '\n', raw_data.tail(2))
dtypes_title = '{:*^60}'.format('Data dtypes:')
print(dtypes_title, '\n', raw_data.dtypes)

# Missing-value review
na_cols = raw_data.isnull().any(axis=0)  # one boolean per column: does it hold any NaN?
print('{:*^60}'.format('NA Cols:'))
na_only = na_cols[na_cols]  # keep only the columns flagged True
print(na_only)
na_row_count = raw_data.isnull().any(axis=1).sum()  # rows with at least one NaN
print('Total NA lines is: {0}'.format(na_row_count))

# Data preprocessing

# Missing values: fill every numeric column with that column's own mean.
# A single scalar (the mean of 'price') passed to fillna would be written into
# the NaNs of *all* columns, which is only correct when 'price' is the sole
# column with gaps; the per-column form gives the same result in that case.
sales_data = raw_data.fillna(raw_data.mean(numeric_only=True))

# Split into features X and target y: every column but the last is a feature,
# the last column is the target.
num = int(0.7 * sales_data.shape[0])  # row index separating train (70%) from test
X, y = sales_data.iloc[:, :-1], sales_data.iloc[:, -1]
# Positional split without shuffling: the first `num` rows train, the rest test.
X_train, X_test = X.iloc[:num, :], X.iloc[num:, :]
y_train, y_test = y.iloc[:num], y.iloc[num:]

# Model training

# Hyper-parameter search with cross-validation
model_gbr = GradientBoostingRegressor()  # base estimator handed to the grid search

# scikit-learn 1.0 renamed the losses 'ls' -> 'squared_error' and
# 'lad' -> 'absolute_error'; the old spellings were later removed, so pick the
# names that match the installed release.
import sklearn  # only needed here, to read the installed version

if int(sklearn.__version__.split('.')[0]) >= 1:
    loss_options = ['squared_error', 'absolute_error', 'huber', 'quantile']
else:
    loss_options = ['ls', 'lad', 'huber', 'quantile']

# Candidate values for every hyper-parameter being tuned
parameters = {'loss': loss_options,
              'n_estimators': [10, 50, 100],
              'learning_rate': [0.05, 0.1, 0.15],
              'max_depth': [2, 3, 4],
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1, 2, 4]}
# Exhaustive search over the grid with 3-fold cross-validation; n_jobs=-1 asks
# scikit-learn to run the fits in parallel on all available processors.
model_gs = GridSearchCV(estimator=model_gbr,
                        param_grid=parameters, cv=3, n_jobs=-1)
model_gs.fit(X_train, y_train)  # run the search on the training split
print('Best score is:', model_gs.best_score_)  # best mean cross-validated score
print('Best parameter is:', model_gs.best_params_)  # parameter set that achieved it

# Keep the best estimator found by the search
model_best = model_gs.best_estimator_

# Model evaluation

# Cross-validation results: mean test score of each parameter combination.
# A bare expression is only echoed in an interactive session, so bind the
# scores and print them explicitly.
# (model_gs.cv_results_.keys() lists the other available result arrays.)
cv_mean_scores = model_gs.cv_results_.get('mean_test_score')
print('{:*^60}'.format('CV mean test scores:'))
print(cv_mean_scores)

# Regression metric on the held-out test split
pre_test = model_best.predict(X_test)
# mean_squared_error is documented as (y_true, y_pred); the value is the same
# either way, but keep the arguments in the documented order.
mse_score = mse(y_test, pre_test)

# Goodness of fit: true vs. predicted target over the test rows
plt.style.use("ggplot")  # use matplotlib's bundled ggplot style
plt.figure(figsize=(10, 7))  # create the figure
plt.plot(np.arange(X_test.shape[0]), y_test, linestyle='-', color='k', label='true y')  # actual values
plt.plot(np.arange(X_test.shape[0]), pre_test, linestyle=':', color='m',
         label='predicted y')  # predicted values
plt.title('best model with mse of {}'.format(int(mse_score)))  # MSE truncated to an integer
plt.legend(loc=0)  # loc=0 lets matplotlib choose the legend position
plt.show()

# Predict on new data

# A single new record to score; values are assumed to follow the same column
# order as the training features -- TODO confirm against the data file.
New_X = np.array([[1, 1, 0, 1, 15, 0.5, 177, 0.66, 101, 798]])
# The model was fitted on a DataFrame, so give the new record the training
# column names: this avoids scikit-learn's feature-name mismatch warning and
# makes the expected column order explicit.
new_record = pd.DataFrame(New_X, columns=X_train.columns)
print('{:*^60}'.format('Predicted orders:'))
print(model_best.predict(new_record).round(0))  # prediction rounded to a whole number