第六章 商品数据化运营
一、案例-基于Gradient Boosting的自动超参数优化的销售预测
说明
- 描述:“代码实操”以及内容延伸部分源代码
- 时间:2019-01-01
- 作者:宋天龙(Tony Song)
- 程序开发环境:win7 64位
- Python版本:64位 3.7
- 依赖库:numpy、pandas、sklearn、matplotlib
- 程序输入:products_sales.txt
- 程序输出:新数据集预测销售量
程序
导入库
import numpy as np # 导入numpy库
import pandas as pd # 导入pandas库
from sklearn.ensemble import GradientBoostingRegressor # 集成方法回归库
from sklearn.model_selection import GridSearchCV # 导入交叉检验库
import matplotlib.pyplot as plt # 导入图形展示库
from sklearn.metrics import mean_squared_error as mse
读取数据
# pd.read_table is deprecated; pd.read_csv with an explicit separator is the
# modern equivalent and yields an identical DataFrame.
raw_data = pd.read_csv('products_sales.txt', sep=',')  # load the sales dataset
数据审查
# Quick structural overview of the raw data.
print(f'{"Data overview:":*^60}', '\n', raw_data.tail(2))  # last 2 raw rows
print(f'{"Data dtypes:":*^60}', '\n', raw_data.dtypes)  # per-column dtypes
***********************Data overview:***********************
limit_infor campaign_type campaign_level product_level \
729 0 6 0 1
730 0 6 0 1
resource_amount email_rate price discount_rate hour_resouces \
729 8 0.8 150.0 0.87 987
730 9 0.8 149.0 0.84 1448
campaign_fee orders
729 2298 3285
730 3392 4840
************************Data dtypes:************************
limit_infor int64
campaign_type int64
campaign_level int64
product_level int64
resource_amount int64
email_rate float64
price float64
discount_rate float64
hour_resouces int64
campaign_fee int64
orders int64
dtype: object
# Missing-value audit: which columns contain NaNs and how many rows are hit.
null_flags = raw_data.isnull()
na_cols = null_flags.any(axis=0)  # per-column "has at least one NaN" flag
print(f'{"NA Cols:":*^60}')
print(na_cols[na_cols] == True)  # show only columns that do have NaNs
print(f'Total NA lines is: {null_flags.any(axis=1).sum()}')  # rows with any NaN
**************************NA Cols:**************************
price True
dtype: bool
Total NA lines is: 2
数据预处理
# --- Missing-value handling ---
# Only the 'price' column contains NaNs (see the audit above). Fill just that
# column with its own mean; the original frame-wide fillna with the price mean
# would silently corrupt any other column that ever gained NaNs.
sales_data = raw_data.copy()
sales_data['price'] = sales_data['price'].fillna(sales_data['price'].mean())
# --- Split into features X / target y, then 70/30 train/test without shuffling ---
num = int(0.7 * sales_data.shape[0])  # index separating train from test rows
X, y = sales_data.iloc[:, :-1], sales_data.iloc[:, -1]  # last column is 'orders'
X_train, X_test = X.iloc[:num, :], X.iloc[num:, :]
y_train, y_test = y.iloc[:num], y.iloc[num:]
模型训练
# --- Hyper-parameter tuning: grid search with 3-fold cross-validation ---
model_gbr = GradientBoostingRegressor()  # base GBDT regressor
# Search space for the grid search.
# NOTE(review): 'ls' and 'lad' were renamed 'squared_error'/'absolute_error'
# in scikit-learn >= 1.2 — confirm the installed version before running.
parameters = dict(
    loss=['ls', 'lad', 'huber', 'quantile'],
    n_estimators=[10, 50, 100],
    learning_rate=[0.05, 0.1, 0.15],
    max_depth=[2, 3, 4],
    min_samples_split=[2, 3, 5],
    min_samples_leaf=[1, 2, 4],
)
model_gs = GridSearchCV(estimator=model_gbr, param_grid=parameters,
                        cv=3, n_jobs=-1)  # exhaustive search using all cores
model_gs.fit(X_train, y_train)  # run the search on the training set
print('Best score is:', model_gs.best_score_)  # best mean CV score
print('Best parameter is:', model_gs.best_params_)  # best parameter combination
Best score is: 0.9551948963085048
Best parameter is: {'learning_rate': 0.15, 'loss': 'huber', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 100}
# Retrieve the best trained model
model_best = model_gs.best_estimator_ # best estimator found by the grid search (refit on the full training set by default)
模型评估
# Cross-validation results: mean test score for every parameter combination
# print(model_gs.cv_results_.keys())
model_gs.cv_results_.get('mean_test_score')
array([-3.04684856e-01, 8.06548246e-01, 9.14809067e-01, -3.04684856e-01,
8.06548246e-01, 9.14890055e-01, -3.04684856e-01, 8.06548246e-01,
9.13781688e-01, -3.04684856e-01, 8.06548246e-01, 9.18813782e-01,
-3.04684856e-01, 8.06548246e-01, 9.18813782e-01, -3.04684856e-01,
8.06548246e-01, 9.18771147e-01, -3.04684856e-01, 8.06548246e-01,
9.19490365e-01, -3.04684856e-01, 8.06548246e-01, 9.19490365e-01,
-3.04684856e-01, 8.06548246e-01, 9.19533014e-01, -1.59989680e-01,
8.69177329e-01, 9.12890192e-01, -1.59989680e-01, 8.69219101e-01,
9.13100917e-01, -1.59989680e-01, 8.69187304e-01, 9.11995397e-01,
-1.59989680e-01, 8.68717639e-01, 9.09639269e-01, -1.59989680e-01,
8.68761471e-01, 9.09810802e-01, -1.59989680e-01, 8.68761471e-01,
9.05874979e-01, -1.58765990e-01, 8.76222207e-01, 9.05835313e-01,
-1.58765990e-01, 8.76222207e-01, 9.05748400e-01, -1.58765990e-01,
8.76245176e-01, 9.05623430e-01, -1.27665112e-01, 8.66218509e-01,
9.01279333e-01, -1.28053601e-01, 8.70103832e-01, 9.05487514e-01,
-1.28303511e-01, 8.69013558e-01, 9.04916146e-01, -1.07331837e-01,
8.69683785e-01, 8.90446472e-01, -1.07331837e-01, 8.70114026e-01,
8.90874298e-01, -1.07251995e-01, 8.69859972e-01, 8.95114465e-01,
-9.71704659e-02, 8.79431029e-01, 9.00107643e-01, -9.71704659e-02,
8.79431029e-01, 9.00060292e-01, -9.71590741e-02, 8.79391451e-01,
9.00141243e-01, -6.22831176e-01, 6.13924733e-01, 8.32917905e-01,
-6.13343830e-01, 6.13923628e-01, 8.33406473e-01, -6.13343830e-01,
6.13923612e-01, 8.32637086e-01, -6.22651606e-01, 6.18839710e-01,
8.39929578e-01, -6.22651606e-01, 6.19876346e-01, 8.39600793e-01,
-6.22651606e-01, 6.15064444e-01, 8.41912083e-01, -6.30286744e-01,
5.89139767e-01, 8.47586021e-01, -6.30274902e-01, 5.89139889e-01,
8.43157079e-01, -6.30092690e-01, 5.64874425e-01, 8.44219083e-01,
-4.99684043e-01, 6.68634026e-01, 8.92665773e-01, -4.99785367e-01,
6.64091119e-01, 8.94663627e-01, -4.99624856e-01, 6.60662440e-01,
9.00112779e-01, -4.99434677e-01, 6.69704886e-01, 8.85456736e-01,
-4.94565392e-01, 6.72516899e-01, 8.88710292e-01, -4.97116387e-01,
6.75399505e-01, 8.96628876e-01, -5.04321359e-01, 6.90712774e-01,
9.03000713e-01, -4.90653103e-01, 6.91786937e-01, 8.84570575e-01,
-5.07794587e-01, 7.00465502e-01, 8.99583823e-01, -4.95188976e-01,
7.01711776e-01, 8.88480505e-01, -4.87161308e-01, 7.05749378e-01,
8.92726551e-01, -4.86692209e-01, 6.66856523e-01, 8.91836546e-01,
-4.76331032e-01, 7.27309804e-01, 8.89563080e-01, -4.76226083e-01,
7.01298775e-01, 8.78990819e-01, -4.78303112e-01, 6.92324457e-01,
8.85337627e-01, -4.63212374e-01, 7.24547808e-01, 9.01919072e-01,
-4.71233249e-01, 7.27248246e-01, 9.01425175e-01, -4.37810506e-01,
7.23156780e-01, 9.00850184e-01, -2.98444588e-01, 7.51592044e-01,
9.07749520e-01, -2.98444588e-01, 7.51592044e-01, 9.07669223e-01,
-2.98444588e-01, 7.51673245e-01, 9.07689653e-01, -2.98444588e-01,
7.51673245e-01, 9.06579834e-01, -2.98444588e-01, 7.51592044e-01,
9.06623736e-01, -2.98444588e-01, 7.51592044e-01, 9.06623736e-01,
-2.98444588e-01, 7.51673245e-01, 9.06621967e-01, -2.98444588e-01,
7.51592044e-01, 9.06532574e-01, -2.98444588e-01, 7.51592044e-01,
9.06578408e-01, -1.63927674e-01, 8.47343775e-01, 9.31962612e-01,
-1.63927674e-01, 8.47332087e-01, 9.30812175e-01, -1.63927674e-01,
8.47332087e-01, 9.31129440e-01, -1.63927674e-01, 8.47374465e-01,
9.34864104e-01, -1.63927674e-01, 8.47369011e-01, 9.33290581e-01,
-1.63927674e-01, 8.47312264e-01, 9.34774506e-01, -1.63894933e-01,
8.46195059e-01, 9.34041142e-01, -1.63894933e-01, 8.46195059e-01,
9.33770580e-01, -1.63894933e-01, 8.46088198e-01, 9.33825014e-01,
-6.71683608e-02, 8.63458018e-01, 9.21457860e-01, -6.85903481e-02,
8.64332610e-01, 9.17137539e-01, -6.83269829e-02, 8.64189431e-01,
9.21627261e-01, -6.37754231e-02, 8.68937198e-01, 9.30375204e-01,
-6.33264128e-02, 8.69168025e-01, 9.30183499e-01, -6.31098567e-02,
8.68671107e-01, 9.31416483e-01, -6.56223179e-02, 8.67196602e-01,
9.31355434e-01, -6.58418305e-02, 8.67547082e-01, 9.31418481e-01,
-6.56223179e-02, 8.66961687e-01, 9.30157499e-01, -4.54599034e+00,
-1.95070414e+00, -5.07890181e-01, -4.51806057e+00, -1.88608962e+00,
-5.35870719e-01, -4.54568908e+00, -1.85072899e+00, -5.53467876e-01,
-4.50965592e+00, -1.85137454e+00, -3.59865273e-01, -4.49411560e+00,
-1.70644868e+00, -5.37196720e-01, -4.50940337e+00, -1.81067978e+00,
-4.77628771e-01, -4.48194736e+00, -1.61219037e+00, -6.12472220e-01,
-4.52785541e+00, -1.67576241e+00, -5.26042485e-01, -4.50192875e+00,
-1.60912616e+00, -5.81262496e-01, -4.34062142e+00, -1.27937933e+00,
-3.39369271e-01, -4.33594100e+00, -1.40597561e+00, -2.97238924e-01,
-4.25848320e+00, -1.23696450e+00, -2.06424090e-01, -4.28557584e+00,
-1.43417293e+00, -2.87956669e-01, -4.33253340e+00, -1.08276379e+00,
-2.59256326e-01, -4.30926560e+00, -1.27536898e+00, -2.05686367e-01,
-4.23617310e+00, -1.25480163e+00, -4.14684777e-02, -4.35877822e+00,
-1.33915509e+00, -1.55675055e-01, -4.30895587e+00, -1.40556323e+00,
-6.74235490e-02, -4.29195632e+00, -1.11130600e+00, -7.80844457e-02,
-4.32886526e+00, -1.17679094e+00, -2.19942512e-01, -4.33555990e+00,
-1.14351218e+00, -7.80372608e-02, -4.33990183e+00, -1.27835169e+00,
-5.29610260e-03, -4.33013758e+00, -1.31552400e+00, 2.68724253e-02,
-4.32508078e+00, -1.28002383e+00, 2.02230962e-02, -4.30721098e+00,
-1.24879544e+00, 8.67682049e-02, -4.31665225e+00, -1.15197544e+00,
1.66059687e-02, -4.33854014e+00, -1.19695084e+00, 1.51681192e-01,
2.66704834e-01, 9.19163131e-01, 9.13614017e-01, 2.66704834e-01,
9.18069802e-01, 9.15250542e-01, 2.66704834e-01, 9.19078855e-01,
9.16165905e-01, 2.66704834e-01, 9.20765539e-01, 9.12808967e-01,
2.66704834e-01, 9.20765539e-01, 9.12598258e-01, 2.66704834e-01,
9.20765539e-01, 9.12844402e-01, 2.66704834e-01, 9.17366411e-01,
9.04987514e-01, 2.66704834e-01, 9.17366411e-01, 9.04987514e-01,
2.66704834e-01, 9.17366411e-01, 9.04987514e-01, 4.60447126e-01,
9.16110672e-01, 9.07552653e-01, 4.60447126e-01, 9.16636229e-01,
9.09158318e-01, 4.60313211e-01, 9.16975516e-01, 9.12866489e-01,
4.60447126e-01, 9.02785952e-01, 8.77898873e-01, 4.60447126e-01,
9.02785189e-01, 8.77289885e-01, 4.60313211e-01, 9.02315066e-01,
8.82812893e-01, 4.68748181e-01, 9.08579465e-01, 8.85044881e-01,
4.68614265e-01, 9.08566095e-01, 8.85178892e-01, 4.68748181e-01,
9.08504824e-01, 8.85128157e-01, 5.04777905e-01, 9.00243364e-01,
8.95760558e-01, 5.05916270e-01, 8.99165955e-01, 8.98146295e-01,
5.05065939e-01, 9.04253064e-01, 9.00894629e-01, 5.13244227e-01,
8.95198757e-01, 8.84166774e-01, 5.12568203e-01, 8.96332425e-01,
8.85858212e-01, 5.12600010e-01, 8.96799422e-01, 8.81965672e-01,
5.36998794e-01, 9.02429522e-01, 8.94358909e-01, 5.37067250e-01,
9.02525169e-01, 8.94632142e-01, 5.36816788e-01, 9.02386085e-01,
8.94342430e-01, -8.26354541e-02, 8.31904198e-01, 9.33817495e-01,
-8.26354541e-02, 8.31372889e-01, 9.33615223e-01, -8.23647742e-02,
8.32549960e-01, 9.32521720e-01, -8.26168721e-02, 8.30658123e-01,
9.31053730e-01, -8.26168721e-02, 8.27170774e-01, 9.27917274e-01,
-8.26168721e-02, 8.25956059e-01, 9.32150190e-01, -9.01977509e-02,
8.36265555e-01, 9.41705490e-01, -9.03665204e-02, 8.36301500e-01,
9.43012594e-01, -9.01977509e-02, 8.35480761e-01, 9.43531457e-01,
4.65890947e-02, 8.79762380e-01, 9.18420714e-01, 4.61663069e-02,
8.84627436e-01, 9.29719705e-01, 4.57375948e-02, 8.66033047e-01,
9.25511499e-01, 7.57226407e-02, 8.77097556e-01, 9.34909245e-01,
7.59343015e-02, 8.67062858e-01, 9.29091479e-01, 7.69899746e-02,
8.99370764e-01, 9.14294886e-01, 7.73228794e-02, 8.92281478e-01,
9.38419239e-01, 9.97836036e-02, 9.01621480e-01, 9.33991584e-01,
1.00644182e-01, 8.93148925e-01, 9.31013108e-01, 7.84364705e-02,
8.70725420e-01, 9.20134272e-01, 8.04182216e-02, 8.97617462e-01,
9.07844791e-01, 8.00431124e-02, 8.90980042e-01, 9.06736829e-01,
6.71818065e-02, 8.96947492e-01, 9.21026251e-01, 8.91255089e-02,
8.86836389e-01, 9.20698744e-01, 9.22180520e-02, 8.83999426e-01,
9.19556184e-01, 1.54408458e-01, 9.07566513e-01, 9.26873436e-01,
1.61108793e-01, 9.01377229e-01, 9.27419968e-01, 1.69377336e-01,
9.04031491e-01, 9.21138311e-01, 2.35276222e-01, 9.11678048e-01,
9.51085773e-01, 2.35276222e-01, 9.11678048e-01, 9.49279246e-01,
2.35276222e-01, 9.11678048e-01, 9.50769664e-01, 2.35276222e-01,
9.12300416e-01, 9.49597202e-01, 2.35276222e-01, 9.11788176e-01,
9.50160539e-01, 2.35276222e-01, 9.12300416e-01, 9.49468968e-01,
2.35276222e-01, 9.12026284e-01, 9.46389508e-01, 2.35276222e-01,
9.12026284e-01, 9.46389508e-01, 2.35276222e-01, 9.12026284e-01,
9.46389508e-01, 4.12778042e-01, 9.32117693e-01, 9.36200124e-01,
4.12778042e-01, 9.31544718e-01, 9.34015084e-01, 4.12778042e-01,
9.31726459e-01, 9.37698252e-01, 4.12803116e-01, 9.34442191e-01,
9.45735237e-01, 4.12803116e-01, 9.34973847e-01, 9.45086832e-01,
4.12803116e-01, 9.34512729e-01, 9.45489201e-01, 4.13263244e-01,
9.35024021e-01, 9.42502399e-01, 4.13263244e-01, 9.35103261e-01,
9.42281973e-01, 4.13263244e-01, 9.35102983e-01, 9.42495768e-01,
4.98305894e-01, 9.25526222e-01, 9.30502503e-01, 5.03891404e-01,
9.24612674e-01, 9.35460008e-01, 5.06776733e-01, 9.26339247e-01,
9.26813043e-01, 5.13767608e-01, 9.29428616e-01, 9.32916916e-01,
5.12902388e-01, 9.26115888e-01, 9.28783151e-01, 5.12574442e-01,
9.23625816e-01, 9.23678981e-01, 5.03659761e-01, 9.28908103e-01,
9.38314664e-01, 5.04200423e-01, 9.29772013e-01, 9.36376311e-01,
5.03980606e-01, 9.28688700e-01, 9.34674112e-01, -3.61756510e+00,
-5.61424650e-01, 3.08313762e-01, -3.62040520e+00, -5.93526816e-01,
3.52221877e-01, -3.62093215e+00, -5.20790333e-01, 2.99506671e-01,
-3.37033168e+00, -4.55821276e-01, 3.10331724e-01, -3.39567104e+00,
-4.51693023e-01, 2.85582647e-01, -3.37096809e+00, -4.85078201e-01,
2.72302396e-01, -3.36633290e+00, -4.74520257e-01, 1.90835544e-01,
-3.37103362e+00, -4.24125712e-01, 2.10672593e-01, -3.36677691e+00,
-4.77291995e-01, 2.03635431e-01, -3.28681054e+00, -1.13656916e-01,
7.26744649e-01, -3.23860647e+00, -6.33596417e-02, 4.60584702e-01,
-3.30345886e+00, -2.27520027e-01, 5.84617680e-01, -3.14468170e+00,
-1.04618209e-01, 5.74062354e-01, -3.34225766e+00, -2.23156977e-01,
5.04417403e-01, -3.30175535e+00, -1.57435672e-01, 3.21727387e-01,
-3.40380620e+00, -5.24890840e-03, 5.68173207e-01, -3.34765749e+00,
-8.55154256e-02, 5.29097047e-01, -3.26490714e+00, -1.91705099e-01,
4.96807023e-01, -3.27618481e+00, 3.57061899e-02, 7.95322292e-01,
-3.30518774e+00, -8.62078866e-02, 5.61558992e-01, -3.23620260e+00,
-1.10882519e-01, 5.33654951e-01, -3.39410042e+00, 8.97976389e-02,
6.37508725e-01, -3.34679657e+00, 4.30133507e-02, 6.46350106e-01,
-3.28104137e+00, 8.63690016e-02, 6.60019886e-01, -3.29014688e+00,
-3.80517880e-03, 6.69689197e-01, -3.18470312e+00, 1.50493362e-01,
6.02131807e-01, -3.31593580e+00, 1.58376043e-02, 6.13694315e-01,
5.76915845e-01, 9.18486377e-01, 9.06676249e-01, 5.76915845e-01,
9.17580444e-01, 9.07439934e-01, 5.76915845e-01, 9.19391535e-01,
9.06115328e-01, 5.76915845e-01, 9.22800037e-01, 9.00716416e-01,
5.76915845e-01, 9.23324797e-01, 8.99860809e-01, 5.76915845e-01,
9.19669462e-01, 8.99520108e-01, 5.76915845e-01, 9.14607234e-01,
8.94480992e-01, 5.76915845e-01, 9.14607234e-01, 8.94480992e-01,
5.76915845e-01, 9.14607234e-01, 8.94497181e-01, 7.37447778e-01,
9.08421583e-01, 9.03966441e-01, 7.37447778e-01, 9.11896547e-01,
9.07825853e-01, 7.37447778e-01, 9.11981759e-01, 9.06743319e-01,
7.36559501e-01, 8.87681806e-01, 8.72549534e-01, 7.36559501e-01,
8.86796128e-01, 8.75207568e-01, 7.36559501e-01, 8.93498653e-01,
8.74767684e-01, 7.33239131e-01, 9.00657908e-01, 8.75250675e-01,
7.33727917e-01, 9.00264489e-01, 8.75243886e-01, 7.33727917e-01,
9.00276039e-01, 8.75342200e-01, 7.37943440e-01, 8.99826230e-01,
9.01312120e-01, 7.38452368e-01, 9.05693433e-01, 9.03750914e-01,
7.38606122e-01, 9.00953015e-01, 8.99180765e-01, 7.47062026e-01,
8.89886875e-01, 8.89350981e-01, 7.47062026e-01, 8.89052714e-01,
8.90042461e-01, 7.46782414e-01, 8.90191764e-01, 8.88329554e-01,
7.64774216e-01, 8.92728058e-01, 8.88259711e-01, 7.64774216e-01,
8.92741189e-01, 8.88366003e-01, 7.64774216e-01, 8.92714134e-01,
8.88419416e-01, 2.01635741e-01, 9.08745928e-01, 9.44275358e-01,
2.01635741e-01, 9.11113421e-01, 9.44242724e-01, 2.01635741e-01,
9.10101356e-01, 9.43863221e-01, 2.54364487e-01, 9.01450150e-01,
9.44161400e-01, 2.54364487e-01, 9.00947244e-01, 9.45480493e-01,
2.54364487e-01, 8.99667994e-01, 9.43029779e-01, 1.90040518e-01,
9.13592856e-01, 9.46386703e-01, 1.90040518e-01, 9.12809231e-01,
9.43201209e-01, 1.90040518e-01, 9.18599807e-01, 9.44732408e-01,
4.39673958e-01, 9.16815015e-01, 9.15355308e-01, 4.40039301e-01,
9.20407567e-01, 9.38579514e-01, 4.35921891e-01, 9.19782748e-01,
9.34965950e-01, 4.36147202e-01, 9.09958587e-01, 9.36225092e-01,
4.38822799e-01, 9.16049533e-01, 9.29721655e-01, 4.33749015e-01,
9.23352441e-01, 9.35580393e-01, 4.57378140e-01, 9.25186734e-01,
9.39506693e-01, 4.72134964e-01, 9.27199180e-01, 9.42786880e-01,
4.56488722e-01, 9.34887574e-01, 9.43254753e-01, 4.15832860e-01,
8.72123287e-01, 9.06797036e-01, 4.07566835e-01, 9.13345858e-01,
8.96687558e-01, 4.01541658e-01, 9.08405865e-01, 9.12112866e-01,
4.84944484e-01, 9.17898168e-01, 9.26311536e-01, 4.64399260e-01,
9.13419508e-01, 9.22962941e-01, 4.61255450e-01, 9.07377722e-01,
9.22786838e-01, 4.95347177e-01, 9.17004714e-01, 9.30001636e-01,
4.87735743e-01, 9.30509824e-01, 9.33033417e-01, 4.55778700e-01,
9.25088020e-01, 9.34141298e-01, 5.16633286e-01, 9.47789861e-01,
9.55076763e-01, 5.16633286e-01, 9.48484215e-01, 9.55194896e-01,
5.16633286e-01, 9.49087577e-01, 9.54331810e-01, 5.16633286e-01,
9.47285712e-01, 9.53315895e-01, 5.16633286e-01, 9.47787205e-01,
9.53702611e-01, 5.16633286e-01, 9.48163883e-01, 9.52877029e-01,
5.16633286e-01, 9.45787459e-01, 9.50732757e-01, 5.16633286e-01,
9.45805117e-01, 9.50728351e-01, 5.16633286e-01, 9.45787459e-01,
9.50735975e-01, 6.63607254e-01, 9.37789960e-01, 9.26398406e-01,
6.63607254e-01, 9.37609287e-01, 9.29696320e-01, 6.63607254e-01,
9.37998715e-01, 9.38919604e-01, 6.64305136e-01, 9.40703249e-01,
9.08170366e-01, 6.64305136e-01, 9.41023961e-01, 9.40018409e-01,
6.64305136e-01, 9.42370520e-01, 9.45055840e-01, 6.72863310e-01,
9.39804623e-01, 9.42556930e-01, 6.72863310e-01, 9.40566698e-01,
9.43005238e-01, 6.72863310e-01, 9.38608668e-01, 9.43518438e-01,
7.18363522e-01, 9.29969487e-01, 9.19607495e-01, 7.22502914e-01,
9.23127885e-01, 9.17108306e-01, 7.25184499e-01, 9.27947160e-01,
9.25341478e-01, 7.26291642e-01, 9.38007033e-01, 9.32290249e-01,
7.27404272e-01, 9.32521308e-01, 9.28269751e-01, 7.27410093e-01,
9.34745061e-01, 9.19652400e-01, 7.35705842e-01, 9.35130605e-01,
9.33561347e-01, 7.36000939e-01, 9.36499106e-01, 9.33388393e-01,
7.35705842e-01, 9.34523961e-01, 9.35987334e-01, -2.98155739e+00,
-1.47487934e-02, 5.44983390e-01, -3.10012674e+00, 3.67552227e-02,
4.84982592e-01, -3.10037603e+00, 9.10236943e-02, 5.41018605e-01,
-3.03657096e+00, 1.45959816e-01, 5.82460270e-01, -3.06458156e+00,
7.45395763e-02, 5.15498538e-01, -3.08752579e+00, 1.35325946e-01,
5.41775353e-01, -2.94074184e+00, 8.19713180e-02, 4.30954160e-01,
-2.94074311e+00, 5.17273954e-02, 4.56978465e-01, -3.02204778e+00,
3.65235223e-02, 4.79634484e-01, -1.94423957e+00, 3.73279929e-01,
5.42336450e-01, -2.05981629e+00, 5.28085085e-01, 5.68070260e-01,
-2.35207814e+00, 2.94413529e-01, 6.23596374e-01, -2.30537286e+00,
2.69970162e-01, 5.31786592e-01, -2.38805459e+00, 3.96651830e-01,
5.85798195e-01, -2.21977187e+00, 2.58465191e-01, 5.18002798e-01,
-2.50672336e+00, 4.83184675e-01, 6.26679621e-01, -2.49496271e+00,
4.38702737e-01, 7.10414416e-01, -2.57814315e+00, 4.46509252e-01,
7.19086112e-01, -2.54749763e+00, 4.74511638e-01, 4.89981281e-01,
-2.49949635e+00, 4.10146721e-01, 5.23840602e-01, -2.44490232e+00,
4.53695263e-01, 6.99762928e-01, -2.21671018e+00, 5.55964252e-01,
7.02946117e-01, -2.41202384e+00, 6.02134397e-01, 6.84566301e-01,
-2.33971649e+00, 5.85083280e-01, 7.24736717e-01, -2.45479514e+00,
4.41503769e-01, 6.46174252e-01, -2.54940339e+00, 5.57071360e-01,
6.20001212e-01, -2.61696306e+00, 5.40715820e-01, 6.44323521e-01])
# Regression metric on the hold-out set.
pre_test = model_best.predict(X_test)  # predictions for the test rows
# sklearn's convention is (y_true, y_pred); squared error is symmetric so the
# value is unchanged — reordered for consistency with mean_absolute_error below.
mse_score = mse(y_test, pre_test)
mse_score
287348.77319651755
from sklearn import metrics
metrics.mean_absolute_error(y_test, pre_test)  # MAE on the hold-out set, (y_true, y_pred) order
233.48776706565366
# Visual check of how well the best model tracks the hold-out target.
plt.style.use("ggplot")  # apply the bundled ggplot style
plt.figure(figsize=(10, 7))  # new canvas
x_axis = np.arange(X_test.shape[0])  # one tick per test sample
plt.plot(x_axis, y_test, linestyle='-', color='k', label='true y')
plt.plot(x_axis, pre_test, linestyle=':', color='m', label='predicted y')
plt.title(f'best model with mse of {int(mse_score)}')
plt.legend(loc=0)  # let matplotlib pick the legend position
<matplotlib.legend.Legend at 0x20689ccfba8>
(图:最优模型在测试集上的真实值与预测值拟合对比曲线,原图文件为 output_23_1.png)
新数据集预测
# Score one new promotion record with the tuned model.
New_X = np.array([[1, 1, 0, 1, 15, 0.5, 177, 0.66, 101, 798]])  # same column order as X
print(f'{"Predicted orders:":*^60}')
predicted_orders = model_best.predict(New_X)
print(predicted_orders.round(0))  # rounded predicted order volume
*********************Predicted orders:**********************
[846.]
二、案例-基于集成算法AdaBoost、GradientBoosting、RandomForest和Bagging的投票组合模型的异常检测
说明
- 描述:“代码实操”以及内容延伸部分源代码
- 时间:2019-01-01
- 作者:宋天龙(Tony Song)
- 程序开发环境:win7 64位
- Python版本:64位 3.7
- 依赖库:numpy、pandas、sklearn、imblearn
- 程序输入:abnormal_orders.txt
- 程序输出:预测数据直接打印输出
程序
导入库
import numpy as np
import pandas as pd # pandas库
from imblearn.over_sampling import SMOTE # 过抽样处理库SMOTE
from sklearn.ensemble import VotingClassifier, GradientBoostingClassifier, \
RandomForestClassifier # 四种集成分类库和投票方法库
from sklearn.model_selection import StratifiedKFold, cross_val_score # 导入交叉检验算法
from sklearn.preprocessing import OrdinalEncoder # 字符串转数值
函数模块
# Date/time feature expansion
def datetime_exp(data):
    '''
    Expand the 'order_date' and 'order_time' string columns into numeric
    calendar/clock features (weekday, day of month, month, second, minute,
    hour), then drop the two source columns.

    :param data: DataFrame with 'order_date' ('%Y-%m-%d') and 'order_time'
                 ('%H:%M:%S') columns; the frame is modified in place.
    :return: the expanded DataFrame without 'order_date'/'order_time'.
    '''
    # pd.datetime was deprecated and removed in pandas 2.0 — use the stdlib
    # datetime class directly. Loop variables no longer shadow `data`.
    from datetime import datetime
    date_set = [datetime.strptime(d, '%Y-%m-%d') for d in data['order_date']]
    data['weekday_data'] = [d.weekday() for d in date_set]    # 0 = Monday
    data['daysinmonth_data'] = [d.day for d in date_set]      # day of month
    data['month_data'] = [d.month for d in date_set]          # month number
    time_set = [datetime.strptime(t, '%H:%M:%S') for t in data['order_time']]
    data['second_data'] = [t.second for t in time_set]        # seconds
    data['minute_data'] = [t.minute for t in time_set]        # minutes
    data['hour_data'] = [t.hour for t in time_set]            # hours
    return data.drop(['order_date', 'order_time'], axis=1)
读取数据
# pd.read_table is deprecated; pd.read_csv is the equivalent modern call.
raw_data = pd.read_csv('abnormal_orders.txt', sep=',')  # load the order dataset
数据审查
# Basic status check: sample rows, dtypes, and descriptive statistics.
for header, body in (('Data overview:', raw_data.tail(2)),
                     ('Data dtypes:', raw_data.dtypes),
                     ('Data DESC:', raw_data.describe().round(2).T)):
    print(f'{header:*^60}')
    print(body)
***********************Data overview:***********************
order_id order_date order_time cat attribution pro_id \
134188 4285770012 2013-09-19 23:55:06 家居日用 GO 1000335947
134189 4285770056 2013-05-20 23:58:59 生活电器厨卫电器 GO 1000009280
pro_brand total_money total_quantity order_source pay_type \
134188 炊大师 79.0 1 抢购 合并支付
134189 海尔 799.0 1 抢购 合并支付
user_id city abnormal_label
134188 shukun 东莞市 0
134189 544975322_ 海口市 0
************************Data dtypes:************************
order_id int64
order_date object
order_time object
cat object
attribution object
pro_id int64
pro_brand object
total_money float64
total_quantity int64
order_source object
pay_type object
user_id object
city object
abnormal_label int64
dtype: object
*************************Data DESC:*************************
count mean std min \
order_id 134190.0 4.214285e+09 1.510533e+08 3.000316e+09
pro_id 134190.0 3.404167e+09 3.287444e+09 1.000000e+09
total_money 134189.0 6.601100e+02 2.901210e+03 5.000000e-01
total_quantity 134190.0 1.200000e+00 3.230000e+00 1.000000e+00
abnormal_label 134190.0 2.100000e-01 4.100000e-01 0.000000e+00
25% 50% 75% max
order_id 4.203350e+09 4.276630e+09 4.281996e+09 4.285770e+09
pro_id 1.000321e+09 1.000369e+09 8.001623e+09 8.002352e+09
total_money 2.900000e+01 9.840000e+01 3.720000e+02 7.660000e+05
total_quantity 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+03
abnormal_label 0.000000e+00 0.000000e+00 0.000000e+00 1.000000e+00
# Missing-value audit: which columns contain NaNs and how many rows are hit.
null_flags = raw_data.isnull()
na_cols = null_flags.any(axis=0)  # per-column NaN indicator
print(f'{"NA Cols:":*^60}')
print(na_cols[na_cols == True])  # only columns that contain NaNs
print(f'Total number of NA lines is: {null_flags.any(axis=1).sum()}')
**************************NA Cols:**************************
cat True
pro_brand True
total_money True
city True
dtype: bool
Total number of NA lines is: 1429
# Class-balance audit of the label column ('Labesl' typo fixed in the header).
print('{:*^60}'.format('Labels samples count:'))
print(raw_data.iloc[:, -1].value_counts())  # counts per abnormal_label value
*******************Labesl samples count:********************
0 105733
1 28457
Name: abnormal_label, dtype: int64
数据预处理
# --- NaN handling: drop incomplete rows ---
drop_na_set = raw_data.dropna()
# --- Drop the order ID column: a unique identifier carries no signal ---
drop_na_set = drop_na_set.drop(['order_id'], axis=1)
# --- Encode string columns as ordinal integers ---
convert_cols = ['cat', 'attribution', 'pro_id', 'pro_brand', 'order_source',
                'pay_type', 'user_id', 'city']  # columns to convert
enc = OrdinalEncoder()
drop_na_set[convert_cols] = enc.fit_transform(drop_na_set[convert_cols])
# --- Expand date/time columns into numeric features ---
data_final = datetime_exp(drop_na_set)
# --- Split X and y, then 70/30 train/test without shuffling ---
num = int(0.7 * data_final.shape[0])
X_raw, y_raw = data_final.drop(['abnormal_label'], axis=1), data_final['abnormal_label']
X_train, X_test = X_raw.iloc[:num, :], X_raw.iloc[num:, :]
y_train, y_test = y_raw.iloc[:num], y_raw.iloc[num:]
# --- Class rebalancing: SMOTE over-sampling on the training set only ---
model_smote = SMOTE()
# fit_sample was renamed fit_resample in imbalanced-learn 0.4 and later removed.
x_smote_resampled, y_smote_resampled = model_smote.fit_resample(X_train, y_train)
模型训练
# --- Build the ensemble and cross-validate ---
model_rf = RandomForestClassifier(max_features=0.8, random_state=0)  # random forest
model_gdbc = GradientBoostingClassifier(max_features=0.8, random_state=0)  # GBDT
estimators = [('randomforest', model_rf), ('gradientboosting', model_gdbc)]  # members
model_vot = VotingClassifier(estimators=estimators, voting='soft',
                             weights=[0.9, 1.2], n_jobs=-1)  # soft-voting ensemble
# random_state without shuffle=True was ignored and raises an error in
# scikit-learn >= 0.22; without shuffling the splits are deterministic anyway.
cv = StratifiedKFold(n_splits=5)
# NOTE(review): this scores model_gdbc, not the voting ensemble — confirm intent.
cv_score = cross_val_score(model_gdbc, x_smote_resampled, y_smote_resampled, cv=cv)
print('{:*^60}'.format('Cross val scores:'), '\n', cv_score)  # per-fold scores
print('Mean scores is: %.2f' % cv_score.mean())  # mean CV score
*********************Cross val scores:**********************
[0.53665893 0.7974478 0.9446249 0.87760074 0.81158636]
Mean scores is: 0.79
# Train the final model
model_vot.fit(x_smote_resampled, y_smote_resampled) # fit the voting ensemble on the rebalanced training data
VotingClassifier(estimators=[('randomforest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features=0.8, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
... subsample=1.0, tol=0.0001, validation_fraction=0.1,
verbose=0, warm_start=False))],
flatten_transform=None, n_jobs=-1, voting='soft',
weights=[0.9, 1.2])
新数据集做预测
# --- Load the new order records to score ---
X_new = pd.read_csv('new_abnormal_orders.csv')
# --- Drop the order ID column, mirroring the training pipeline ---
X_new_drop = X_new.drop(['order_id'], axis=1)
# --- Encode strings with the encoder fitted on the training data ---
# NOTE(review): OrdinalEncoder.transform raises on categories unseen during
# fit — confirm the new data only contains known category values.
X_new_drop[convert_cols] = enc.transform(X_new_drop[convert_cols])
# --- Expand date/time features ---
X_new_final = datetime_exp(X_new_drop)
# --- Predict labels and class probabilities ---
predict_label = model_vot.predict(X_new_final)
predict_proba = model_vot.predict_proba(X_new_final)
predict_np = np.hstack((predict_label.reshape(-1, 1), predict_proba))
# 'lables'/'Labesls' typos fixed in the output column names and header.
predict_pd = pd.DataFrame(predict_np, columns=['labels', 'proba_0', 'proba_1'])
print('{:*^60}'.format('Predicted Labels:'), '\n', predict_pd)
*********************Predicted Labesls:*********************
lables proba_0 proba_1
0 1.0 0.430347 0.569653
1 0.0 0.706741 0.293259
2 0.0 0.987728 0.012272
3 0.0 0.991092 0.008908
4 0.0 0.993395 0.006605
5 0.0 0.758719 0.241281
6 0.0 0.759313 0.240687