数据化运营chapter6_code--商品运营案例


第六章  商品数据化运营

一、案例-基于Gradient Boosting的自动超参数优化的销售预测

说明

  • 描述:“代码实操”以及内容延伸部分源代码
  • 时间:2019-01-01
  • 作者:宋天龙(Tony Song)
  • 程序开发环境:win7 64位
  • Python版本:64位 3.7
  • 依赖库:numpy、pandas、sklearn、matplotlib
  • 程序输入:products_sales.txt
  • 程序输出:新数据集预测销售量

程序

导入库

import numpy as np  # 导入numpy库
import pandas as pd  # 导入pandas库
from sklearn.ensemble import GradientBoostingRegressor  # 集成方法回归库
from sklearn.model_selection import GridSearchCV  # 导入交叉检验库
import matplotlib.pyplot as plt  # 导入图形展示库
from sklearn.metrics import mean_squared_error as mse

读取数据

# read_csv is the idiomatic reader for comma-delimited files
# (read_table defaults to tab-separated and needed delimiter=',' overridden).
raw_data = pd.read_csv('products_sales.txt')

数据审查

# Data overview
print('{:*^60}'.format('Data overview:'),'\n',raw_data.tail(2))  # print the last 2 rows of the raw data
print('{:*^60}'.format('Data dtypes:'),'\n',raw_data.dtypes)  # column data types
***********************Data overview:*********************** 
      limit_infor  campaign_type  campaign_level  product_level  \
729            0              6               0              1   
730            0              6               0              1   

     resource_amount  email_rate  price  discount_rate  hour_resouces  \
729                8         0.8  150.0           0.87            987   
730                9         0.8  149.0           0.84           1448   

     campaign_fee  orders  
729          2298    3285  
730          3392    4840  
************************Data dtypes:************************ 
 limit_infor          int64
campaign_type        int64
campaign_level       int64
product_level        int64
resource_amount      int64
email_rate         float64
price              float64
discount_rate      float64
hour_resouces        int64
campaign_fee         int64
orders               int64
dtype: object
# Missing-value audit
na_cols = raw_data.isnull().any(axis=0)  # whether each column contains any NaN
print('{:*^60}'.format('NA Cols:'))
# The original wrote `na_cols[na_cols]==True`, which filters first and then
# compares the all-True result to True — accidentally identical output, but
# the intended (and clearer) form is the boolean mask inside the brackets,
# matching the second case study below.
print(na_cols[na_cols == True])  # show only the columns that do have missing values
print('Total NA lines is: {0}'.format(raw_data.isnull().any(axis=1).sum()))  # rows containing at least one NaN
**************************NA Cols:**************************
price    True
dtype: bool
Total NA lines is: 2

数据预处理

# Missing-value handling: fill NaNs in 'price' with the column mean.
# Restricting the fill to 'price' (via a dict) avoids silently imputing the
# price mean into any other column that might contain NaNs; on this data
# only 'price' has missing values, so the result is unchanged.
sales_data = raw_data.fillna({'price': raw_data['price'].mean()})
# Split features X and target y, then a sequential 70/30 train/test split
num = int(0.7 * sales_data.shape[0])
X, y = sales_data.iloc[:, :-1], sales_data.iloc[:, -1]
X_train, X_test = X.iloc[:num, :], X.iloc[num:, :]
y_train, y_test = y.iloc[:num], y.iloc[num:]

模型训练

# Train with hyper-parameter grid search and cross-validation
model_gbr = GradientBoostingRegressor()  # gradient-boosting regressor with default settings
# NOTE(review): the loss names 'ls'/'lad' were renamed to
# 'squared_error'/'absolute_error' in scikit-learn 1.0 and removed in 1.2;
# this grid only runs as-is on older scikit-learn versions.
parameters = {'loss': ['ls', 'lad', 'huber', 'quantile'],
              'n_estimators': [10, 50, 100],
              'learning_rate': [0.05, 0.1, 0.15],
              'max_depth': [2, 3, 4],
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1, 2, 4]}  # hyper-parameter grid to search
model_gs = GridSearchCV(estimator=model_gbr,
                        param_grid=parameters, cv=3, n_jobs=-1)  # exhaustive grid search with 3-fold CV, all cores
model_gs.fit(X_train, y_train)  # run the search on the training split
print('Best score is:', model_gs.best_score_)  # best mean cross-validated score
print('Best parameter is:', model_gs.best_params_)  # parameter combination that achieved it
Best score is: 0.9551948963085048
Best parameter is: {'learning_rate': 0.15, 'loss': 'huber', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 100}
# Retrieve the best fitted model
model_best = model_gs.best_estimator_  # estimator refit on the full training split with the best params

模型评估

# Cross-validation results of the grid search
# print(model_gs.cv_results_.keys())
model_gs.cv_results_.get('mean_test_score')  # notebook-style echo of the per-candidate mean test scores; a no-op when run as a script
array([-3.04684856e-01,  8.06548246e-01,  9.14809067e-01, -3.04684856e-01,
        8.06548246e-01,  9.14890055e-01, -3.04684856e-01,  8.06548246e-01,
        9.13781688e-01, -3.04684856e-01,  8.06548246e-01,  9.18813782e-01,
       -3.04684856e-01,  8.06548246e-01,  9.18813782e-01, -3.04684856e-01,
        8.06548246e-01,  9.18771147e-01, -3.04684856e-01,  8.06548246e-01,
        9.19490365e-01, -3.04684856e-01,  8.06548246e-01,  9.19490365e-01,
       -3.04684856e-01,  8.06548246e-01,  9.19533014e-01, -1.59989680e-01,
        8.69177329e-01,  9.12890192e-01, -1.59989680e-01,  8.69219101e-01,
        9.13100917e-01, -1.59989680e-01,  8.69187304e-01,  9.11995397e-01,
       -1.59989680e-01,  8.68717639e-01,  9.09639269e-01, -1.59989680e-01,
        8.68761471e-01,  9.09810802e-01, -1.59989680e-01,  8.68761471e-01,
        9.05874979e-01, -1.58765990e-01,  8.76222207e-01,  9.05835313e-01,
       -1.58765990e-01,  8.76222207e-01,  9.05748400e-01, -1.58765990e-01,
        8.76245176e-01,  9.05623430e-01, -1.27665112e-01,  8.66218509e-01,
        9.01279333e-01, -1.28053601e-01,  8.70103832e-01,  9.05487514e-01,
       -1.28303511e-01,  8.69013558e-01,  9.04916146e-01, -1.07331837e-01,
        8.69683785e-01,  8.90446472e-01, -1.07331837e-01,  8.70114026e-01,
        8.90874298e-01, -1.07251995e-01,  8.69859972e-01,  8.95114465e-01,
       -9.71704659e-02,  8.79431029e-01,  9.00107643e-01, -9.71704659e-02,
        8.79431029e-01,  9.00060292e-01, -9.71590741e-02,  8.79391451e-01,
        9.00141243e-01, -6.22831176e-01,  6.13924733e-01,  8.32917905e-01,
       -6.13343830e-01,  6.13923628e-01,  8.33406473e-01, -6.13343830e-01,
        6.13923612e-01,  8.32637086e-01, -6.22651606e-01,  6.18839710e-01,
        8.39929578e-01, -6.22651606e-01,  6.19876346e-01,  8.39600793e-01,
       -6.22651606e-01,  6.15064444e-01,  8.41912083e-01, -6.30286744e-01,
        5.89139767e-01,  8.47586021e-01, -6.30274902e-01,  5.89139889e-01,
        8.43157079e-01, -6.30092690e-01,  5.64874425e-01,  8.44219083e-01,
       -4.99684043e-01,  6.68634026e-01,  8.92665773e-01, -4.99785367e-01,
        6.64091119e-01,  8.94663627e-01, -4.99624856e-01,  6.60662440e-01,
        9.00112779e-01, -4.99434677e-01,  6.69704886e-01,  8.85456736e-01,
       -4.94565392e-01,  6.72516899e-01,  8.88710292e-01, -4.97116387e-01,
        6.75399505e-01,  8.96628876e-01, -5.04321359e-01,  6.90712774e-01,
        9.03000713e-01, -4.90653103e-01,  6.91786937e-01,  8.84570575e-01,
       -5.07794587e-01,  7.00465502e-01,  8.99583823e-01, -4.95188976e-01,
        7.01711776e-01,  8.88480505e-01, -4.87161308e-01,  7.05749378e-01,
        8.92726551e-01, -4.86692209e-01,  6.66856523e-01,  8.91836546e-01,
       -4.76331032e-01,  7.27309804e-01,  8.89563080e-01, -4.76226083e-01,
        7.01298775e-01,  8.78990819e-01, -4.78303112e-01,  6.92324457e-01,
        8.85337627e-01, -4.63212374e-01,  7.24547808e-01,  9.01919072e-01,
       -4.71233249e-01,  7.27248246e-01,  9.01425175e-01, -4.37810506e-01,
        7.23156780e-01,  9.00850184e-01, -2.98444588e-01,  7.51592044e-01,
        9.07749520e-01, -2.98444588e-01,  7.51592044e-01,  9.07669223e-01,
       -2.98444588e-01,  7.51673245e-01,  9.07689653e-01, -2.98444588e-01,
        7.51673245e-01,  9.06579834e-01, -2.98444588e-01,  7.51592044e-01,
        9.06623736e-01, -2.98444588e-01,  7.51592044e-01,  9.06623736e-01,
       -2.98444588e-01,  7.51673245e-01,  9.06621967e-01, -2.98444588e-01,
        7.51592044e-01,  9.06532574e-01, -2.98444588e-01,  7.51592044e-01,
        9.06578408e-01, -1.63927674e-01,  8.47343775e-01,  9.31962612e-01,
       -1.63927674e-01,  8.47332087e-01,  9.30812175e-01, -1.63927674e-01,
        8.47332087e-01,  9.31129440e-01, -1.63927674e-01,  8.47374465e-01,
        9.34864104e-01, -1.63927674e-01,  8.47369011e-01,  9.33290581e-01,
       -1.63927674e-01,  8.47312264e-01,  9.34774506e-01, -1.63894933e-01,
        8.46195059e-01,  9.34041142e-01, -1.63894933e-01,  8.46195059e-01,
        9.33770580e-01, -1.63894933e-01,  8.46088198e-01,  9.33825014e-01,
       -6.71683608e-02,  8.63458018e-01,  9.21457860e-01, -6.85903481e-02,
        8.64332610e-01,  9.17137539e-01, -6.83269829e-02,  8.64189431e-01,
        9.21627261e-01, -6.37754231e-02,  8.68937198e-01,  9.30375204e-01,
       -6.33264128e-02,  8.69168025e-01,  9.30183499e-01, -6.31098567e-02,
        8.68671107e-01,  9.31416483e-01, -6.56223179e-02,  8.67196602e-01,
        9.31355434e-01, -6.58418305e-02,  8.67547082e-01,  9.31418481e-01,
       -6.56223179e-02,  8.66961687e-01,  9.30157499e-01, -4.54599034e+00,
       -1.95070414e+00, -5.07890181e-01, -4.51806057e+00, -1.88608962e+00,
       -5.35870719e-01, -4.54568908e+00, -1.85072899e+00, -5.53467876e-01,
       -4.50965592e+00, -1.85137454e+00, -3.59865273e-01, -4.49411560e+00,
       -1.70644868e+00, -5.37196720e-01, -4.50940337e+00, -1.81067978e+00,
       -4.77628771e-01, -4.48194736e+00, -1.61219037e+00, -6.12472220e-01,
       -4.52785541e+00, -1.67576241e+00, -5.26042485e-01, -4.50192875e+00,
       -1.60912616e+00, -5.81262496e-01, -4.34062142e+00, -1.27937933e+00,
       -3.39369271e-01, -4.33594100e+00, -1.40597561e+00, -2.97238924e-01,
       -4.25848320e+00, -1.23696450e+00, -2.06424090e-01, -4.28557584e+00,
       -1.43417293e+00, -2.87956669e-01, -4.33253340e+00, -1.08276379e+00,
       -2.59256326e-01, -4.30926560e+00, -1.27536898e+00, -2.05686367e-01,
       -4.23617310e+00, -1.25480163e+00, -4.14684777e-02, -4.35877822e+00,
       -1.33915509e+00, -1.55675055e-01, -4.30895587e+00, -1.40556323e+00,
       -6.74235490e-02, -4.29195632e+00, -1.11130600e+00, -7.80844457e-02,
       -4.32886526e+00, -1.17679094e+00, -2.19942512e-01, -4.33555990e+00,
       -1.14351218e+00, -7.80372608e-02, -4.33990183e+00, -1.27835169e+00,
       -5.29610260e-03, -4.33013758e+00, -1.31552400e+00,  2.68724253e-02,
       -4.32508078e+00, -1.28002383e+00,  2.02230962e-02, -4.30721098e+00,
       -1.24879544e+00,  8.67682049e-02, -4.31665225e+00, -1.15197544e+00,
        1.66059687e-02, -4.33854014e+00, -1.19695084e+00,  1.51681192e-01,
        2.66704834e-01,  9.19163131e-01,  9.13614017e-01,  2.66704834e-01,
        9.18069802e-01,  9.15250542e-01,  2.66704834e-01,  9.19078855e-01,
        9.16165905e-01,  2.66704834e-01,  9.20765539e-01,  9.12808967e-01,
        2.66704834e-01,  9.20765539e-01,  9.12598258e-01,  2.66704834e-01,
        9.20765539e-01,  9.12844402e-01,  2.66704834e-01,  9.17366411e-01,
        9.04987514e-01,  2.66704834e-01,  9.17366411e-01,  9.04987514e-01,
        2.66704834e-01,  9.17366411e-01,  9.04987514e-01,  4.60447126e-01,
        9.16110672e-01,  9.07552653e-01,  4.60447126e-01,  9.16636229e-01,
        9.09158318e-01,  4.60313211e-01,  9.16975516e-01,  9.12866489e-01,
        4.60447126e-01,  9.02785952e-01,  8.77898873e-01,  4.60447126e-01,
        9.02785189e-01,  8.77289885e-01,  4.60313211e-01,  9.02315066e-01,
        8.82812893e-01,  4.68748181e-01,  9.08579465e-01,  8.85044881e-01,
        4.68614265e-01,  9.08566095e-01,  8.85178892e-01,  4.68748181e-01,
        9.08504824e-01,  8.85128157e-01,  5.04777905e-01,  9.00243364e-01,
        8.95760558e-01,  5.05916270e-01,  8.99165955e-01,  8.98146295e-01,
        5.05065939e-01,  9.04253064e-01,  9.00894629e-01,  5.13244227e-01,
        8.95198757e-01,  8.84166774e-01,  5.12568203e-01,  8.96332425e-01,
        8.85858212e-01,  5.12600010e-01,  8.96799422e-01,  8.81965672e-01,
        5.36998794e-01,  9.02429522e-01,  8.94358909e-01,  5.37067250e-01,
        9.02525169e-01,  8.94632142e-01,  5.36816788e-01,  9.02386085e-01,
        8.94342430e-01, -8.26354541e-02,  8.31904198e-01,  9.33817495e-01,
       -8.26354541e-02,  8.31372889e-01,  9.33615223e-01, -8.23647742e-02,
        8.32549960e-01,  9.32521720e-01, -8.26168721e-02,  8.30658123e-01,
        9.31053730e-01, -8.26168721e-02,  8.27170774e-01,  9.27917274e-01,
       -8.26168721e-02,  8.25956059e-01,  9.32150190e-01, -9.01977509e-02,
        8.36265555e-01,  9.41705490e-01, -9.03665204e-02,  8.36301500e-01,
        9.43012594e-01, -9.01977509e-02,  8.35480761e-01,  9.43531457e-01,
        4.65890947e-02,  8.79762380e-01,  9.18420714e-01,  4.61663069e-02,
        8.84627436e-01,  9.29719705e-01,  4.57375948e-02,  8.66033047e-01,
        9.25511499e-01,  7.57226407e-02,  8.77097556e-01,  9.34909245e-01,
        7.59343015e-02,  8.67062858e-01,  9.29091479e-01,  7.69899746e-02,
        8.99370764e-01,  9.14294886e-01,  7.73228794e-02,  8.92281478e-01,
        9.38419239e-01,  9.97836036e-02,  9.01621480e-01,  9.33991584e-01,
        1.00644182e-01,  8.93148925e-01,  9.31013108e-01,  7.84364705e-02,
        8.70725420e-01,  9.20134272e-01,  8.04182216e-02,  8.97617462e-01,
        9.07844791e-01,  8.00431124e-02,  8.90980042e-01,  9.06736829e-01,
        6.71818065e-02,  8.96947492e-01,  9.21026251e-01,  8.91255089e-02,
        8.86836389e-01,  9.20698744e-01,  9.22180520e-02,  8.83999426e-01,
        9.19556184e-01,  1.54408458e-01,  9.07566513e-01,  9.26873436e-01,
        1.61108793e-01,  9.01377229e-01,  9.27419968e-01,  1.69377336e-01,
        9.04031491e-01,  9.21138311e-01,  2.35276222e-01,  9.11678048e-01,
        9.51085773e-01,  2.35276222e-01,  9.11678048e-01,  9.49279246e-01,
        2.35276222e-01,  9.11678048e-01,  9.50769664e-01,  2.35276222e-01,
        9.12300416e-01,  9.49597202e-01,  2.35276222e-01,  9.11788176e-01,
        9.50160539e-01,  2.35276222e-01,  9.12300416e-01,  9.49468968e-01,
        2.35276222e-01,  9.12026284e-01,  9.46389508e-01,  2.35276222e-01,
        9.12026284e-01,  9.46389508e-01,  2.35276222e-01,  9.12026284e-01,
        9.46389508e-01,  4.12778042e-01,  9.32117693e-01,  9.36200124e-01,
        4.12778042e-01,  9.31544718e-01,  9.34015084e-01,  4.12778042e-01,
        9.31726459e-01,  9.37698252e-01,  4.12803116e-01,  9.34442191e-01,
        9.45735237e-01,  4.12803116e-01,  9.34973847e-01,  9.45086832e-01,
        4.12803116e-01,  9.34512729e-01,  9.45489201e-01,  4.13263244e-01,
        9.35024021e-01,  9.42502399e-01,  4.13263244e-01,  9.35103261e-01,
        9.42281973e-01,  4.13263244e-01,  9.35102983e-01,  9.42495768e-01,
        4.98305894e-01,  9.25526222e-01,  9.30502503e-01,  5.03891404e-01,
        9.24612674e-01,  9.35460008e-01,  5.06776733e-01,  9.26339247e-01,
        9.26813043e-01,  5.13767608e-01,  9.29428616e-01,  9.32916916e-01,
        5.12902388e-01,  9.26115888e-01,  9.28783151e-01,  5.12574442e-01,
        9.23625816e-01,  9.23678981e-01,  5.03659761e-01,  9.28908103e-01,
        9.38314664e-01,  5.04200423e-01,  9.29772013e-01,  9.36376311e-01,
        5.03980606e-01,  9.28688700e-01,  9.34674112e-01, -3.61756510e+00,
       -5.61424650e-01,  3.08313762e-01, -3.62040520e+00, -5.93526816e-01,
        3.52221877e-01, -3.62093215e+00, -5.20790333e-01,  2.99506671e-01,
       -3.37033168e+00, -4.55821276e-01,  3.10331724e-01, -3.39567104e+00,
       -4.51693023e-01,  2.85582647e-01, -3.37096809e+00, -4.85078201e-01,
        2.72302396e-01, -3.36633290e+00, -4.74520257e-01,  1.90835544e-01,
       -3.37103362e+00, -4.24125712e-01,  2.10672593e-01, -3.36677691e+00,
       -4.77291995e-01,  2.03635431e-01, -3.28681054e+00, -1.13656916e-01,
        7.26744649e-01, -3.23860647e+00, -6.33596417e-02,  4.60584702e-01,
       -3.30345886e+00, -2.27520027e-01,  5.84617680e-01, -3.14468170e+00,
       -1.04618209e-01,  5.74062354e-01, -3.34225766e+00, -2.23156977e-01,
        5.04417403e-01, -3.30175535e+00, -1.57435672e-01,  3.21727387e-01,
       -3.40380620e+00, -5.24890840e-03,  5.68173207e-01, -3.34765749e+00,
       -8.55154256e-02,  5.29097047e-01, -3.26490714e+00, -1.91705099e-01,
        4.96807023e-01, -3.27618481e+00,  3.57061899e-02,  7.95322292e-01,
       -3.30518774e+00, -8.62078866e-02,  5.61558992e-01, -3.23620260e+00,
       -1.10882519e-01,  5.33654951e-01, -3.39410042e+00,  8.97976389e-02,
        6.37508725e-01, -3.34679657e+00,  4.30133507e-02,  6.46350106e-01,
       -3.28104137e+00,  8.63690016e-02,  6.60019886e-01, -3.29014688e+00,
       -3.80517880e-03,  6.69689197e-01, -3.18470312e+00,  1.50493362e-01,
        6.02131807e-01, -3.31593580e+00,  1.58376043e-02,  6.13694315e-01,
        5.76915845e-01,  9.18486377e-01,  9.06676249e-01,  5.76915845e-01,
        9.17580444e-01,  9.07439934e-01,  5.76915845e-01,  9.19391535e-01,
        9.06115328e-01,  5.76915845e-01,  9.22800037e-01,  9.00716416e-01,
        5.76915845e-01,  9.23324797e-01,  8.99860809e-01,  5.76915845e-01,
        9.19669462e-01,  8.99520108e-01,  5.76915845e-01,  9.14607234e-01,
        8.94480992e-01,  5.76915845e-01,  9.14607234e-01,  8.94480992e-01,
        5.76915845e-01,  9.14607234e-01,  8.94497181e-01,  7.37447778e-01,
        9.08421583e-01,  9.03966441e-01,  7.37447778e-01,  9.11896547e-01,
        9.07825853e-01,  7.37447778e-01,  9.11981759e-01,  9.06743319e-01,
        7.36559501e-01,  8.87681806e-01,  8.72549534e-01,  7.36559501e-01,
        8.86796128e-01,  8.75207568e-01,  7.36559501e-01,  8.93498653e-01,
        8.74767684e-01,  7.33239131e-01,  9.00657908e-01,  8.75250675e-01,
        7.33727917e-01,  9.00264489e-01,  8.75243886e-01,  7.33727917e-01,
        9.00276039e-01,  8.75342200e-01,  7.37943440e-01,  8.99826230e-01,
        9.01312120e-01,  7.38452368e-01,  9.05693433e-01,  9.03750914e-01,
        7.38606122e-01,  9.00953015e-01,  8.99180765e-01,  7.47062026e-01,
        8.89886875e-01,  8.89350981e-01,  7.47062026e-01,  8.89052714e-01,
        8.90042461e-01,  7.46782414e-01,  8.90191764e-01,  8.88329554e-01,
        7.64774216e-01,  8.92728058e-01,  8.88259711e-01,  7.64774216e-01,
        8.92741189e-01,  8.88366003e-01,  7.64774216e-01,  8.92714134e-01,
        8.88419416e-01,  2.01635741e-01,  9.08745928e-01,  9.44275358e-01,
        2.01635741e-01,  9.11113421e-01,  9.44242724e-01,  2.01635741e-01,
        9.10101356e-01,  9.43863221e-01,  2.54364487e-01,  9.01450150e-01,
        9.44161400e-01,  2.54364487e-01,  9.00947244e-01,  9.45480493e-01,
        2.54364487e-01,  8.99667994e-01,  9.43029779e-01,  1.90040518e-01,
        9.13592856e-01,  9.46386703e-01,  1.90040518e-01,  9.12809231e-01,
        9.43201209e-01,  1.90040518e-01,  9.18599807e-01,  9.44732408e-01,
        4.39673958e-01,  9.16815015e-01,  9.15355308e-01,  4.40039301e-01,
        9.20407567e-01,  9.38579514e-01,  4.35921891e-01,  9.19782748e-01,
        9.34965950e-01,  4.36147202e-01,  9.09958587e-01,  9.36225092e-01,
        4.38822799e-01,  9.16049533e-01,  9.29721655e-01,  4.33749015e-01,
        9.23352441e-01,  9.35580393e-01,  4.57378140e-01,  9.25186734e-01,
        9.39506693e-01,  4.72134964e-01,  9.27199180e-01,  9.42786880e-01,
        4.56488722e-01,  9.34887574e-01,  9.43254753e-01,  4.15832860e-01,
        8.72123287e-01,  9.06797036e-01,  4.07566835e-01,  9.13345858e-01,
        8.96687558e-01,  4.01541658e-01,  9.08405865e-01,  9.12112866e-01,
        4.84944484e-01,  9.17898168e-01,  9.26311536e-01,  4.64399260e-01,
        9.13419508e-01,  9.22962941e-01,  4.61255450e-01,  9.07377722e-01,
        9.22786838e-01,  4.95347177e-01,  9.17004714e-01,  9.30001636e-01,
        4.87735743e-01,  9.30509824e-01,  9.33033417e-01,  4.55778700e-01,
        9.25088020e-01,  9.34141298e-01,  5.16633286e-01,  9.47789861e-01,
        9.55076763e-01,  5.16633286e-01,  9.48484215e-01,  9.55194896e-01,
        5.16633286e-01,  9.49087577e-01,  9.54331810e-01,  5.16633286e-01,
        9.47285712e-01,  9.53315895e-01,  5.16633286e-01,  9.47787205e-01,
        9.53702611e-01,  5.16633286e-01,  9.48163883e-01,  9.52877029e-01,
        5.16633286e-01,  9.45787459e-01,  9.50732757e-01,  5.16633286e-01,
        9.45805117e-01,  9.50728351e-01,  5.16633286e-01,  9.45787459e-01,
        9.50735975e-01,  6.63607254e-01,  9.37789960e-01,  9.26398406e-01,
        6.63607254e-01,  9.37609287e-01,  9.29696320e-01,  6.63607254e-01,
        9.37998715e-01,  9.38919604e-01,  6.64305136e-01,  9.40703249e-01,
        9.08170366e-01,  6.64305136e-01,  9.41023961e-01,  9.40018409e-01,
        6.64305136e-01,  9.42370520e-01,  9.45055840e-01,  6.72863310e-01,
        9.39804623e-01,  9.42556930e-01,  6.72863310e-01,  9.40566698e-01,
        9.43005238e-01,  6.72863310e-01,  9.38608668e-01,  9.43518438e-01,
        7.18363522e-01,  9.29969487e-01,  9.19607495e-01,  7.22502914e-01,
        9.23127885e-01,  9.17108306e-01,  7.25184499e-01,  9.27947160e-01,
        9.25341478e-01,  7.26291642e-01,  9.38007033e-01,  9.32290249e-01,
        7.27404272e-01,  9.32521308e-01,  9.28269751e-01,  7.27410093e-01,
        9.34745061e-01,  9.19652400e-01,  7.35705842e-01,  9.35130605e-01,
        9.33561347e-01,  7.36000939e-01,  9.36499106e-01,  9.33388393e-01,
        7.35705842e-01,  9.34523961e-01,  9.35987334e-01, -2.98155739e+00,
       -1.47487934e-02,  5.44983390e-01, -3.10012674e+00,  3.67552227e-02,
        4.84982592e-01, -3.10037603e+00,  9.10236943e-02,  5.41018605e-01,
       -3.03657096e+00,  1.45959816e-01,  5.82460270e-01, -3.06458156e+00,
        7.45395763e-02,  5.15498538e-01, -3.08752579e+00,  1.35325946e-01,
        5.41775353e-01, -2.94074184e+00,  8.19713180e-02,  4.30954160e-01,
       -2.94074311e+00,  5.17273954e-02,  4.56978465e-01, -3.02204778e+00,
        3.65235223e-02,  4.79634484e-01, -1.94423957e+00,  3.73279929e-01,
        5.42336450e-01, -2.05981629e+00,  5.28085085e-01,  5.68070260e-01,
       -2.35207814e+00,  2.94413529e-01,  6.23596374e-01, -2.30537286e+00,
        2.69970162e-01,  5.31786592e-01, -2.38805459e+00,  3.96651830e-01,
        5.85798195e-01, -2.21977187e+00,  2.58465191e-01,  5.18002798e-01,
       -2.50672336e+00,  4.83184675e-01,  6.26679621e-01, -2.49496271e+00,
        4.38702737e-01,  7.10414416e-01, -2.57814315e+00,  4.46509252e-01,
        7.19086112e-01, -2.54749763e+00,  4.74511638e-01,  4.89981281e-01,
       -2.49949635e+00,  4.10146721e-01,  5.23840602e-01, -2.44490232e+00,
        4.53695263e-01,  6.99762928e-01, -2.21671018e+00,  5.55964252e-01,
        7.02946117e-01, -2.41202384e+00,  6.02134397e-01,  6.84566301e-01,
       -2.33971649e+00,  5.85083280e-01,  7.24736717e-01, -2.45479514e+00,
        4.41503769e-01,  6.46174252e-01, -2.54940339e+00,  5.57071360e-01,
        6.20001212e-01, -2.61696306e+00,  5.40715820e-01,  6.44323521e-01])
# Regression metric: MSE on the hold-out set
pre_test = model_best.predict(X_test)  # predictions of the best estimator
# Convention: y_true first, y_pred second. MSE is symmetric, so the value
# is unchanged, but this matches the sklearn.metrics signature.
mse_score = mse(y_test, pre_test)
mse_score  # notebook-style echo; a no-op when run as a script
287348.77319651755
from sklearn import metrics
metrics.mean_absolute_error(y_test, pre_test)  # MAE on the hold-out set (notebook-style echo; a no-op in a script)
233.48776706565366
# Model fit visualization
plt.style.use("ggplot")  # apply matplotlib's built-in ggplot style sheet
plt.figure(figsize=(10,7))  # create the figure canvas
plt.plot(np.arange(X_test.shape[0]), y_test, linestyle='-', color='k', label='true y')  # curve of the true target
plt.plot(np.arange(X_test.shape[0]), pre_test, linestyle=':', color='m',
         label='predicted y')  # curve of the predicted target
plt.title('best model with mse of {}'.format(int(mse_score)))
plt.legend(loc=0)  # loc=0 lets matplotlib choose the best legend position
<matplotlib.legend.Legend at 0x20689ccfba8>

(图:output_23_1.png —— 测试集上真实销售量与预测销售量的拟合对比曲线)

新数据集预测

# One new record to predict; column order must match the training features:
# limit_infor, campaign_type, campaign_level, product_level, resource_amount,
# email_rate, price, discount_rate, hour_resouces, campaign_fee
# NOTE(review): passing a bare ndarray to a model fitted on a DataFrame
# triggers a feature-names warning on newer scikit-learn — confirm acceptable.
New_X = np.array([[1, 1, 0, 1, 15, 0.5, 177, 0.66, 101, 798]])
print('{:*^60}'.format('Predicted orders:'))
print(model_best.predict(New_X).round(0))  # print the predicted order volume, rounded to whole orders
*********************Predicted orders:**********************
[846.]

二、案例-基于集成算法AdaBoost、GradientBoosting、RandomForest和Bagging的投票组合模型的异常检测

说明

  • 描述:“代码实操”以及内容延伸部分源代码
  • 时间:2019-01-01
  • 作者:宋天龙(Tony Song)
  • 程序开发环境:win7 64位
  • Python版本:64位 3.7
  • 依赖库:numpy、pandas、sklearn、imblearn
  • 程序输入:abnormal_orders.txt
  • 程序输出:预测数据直接打印输出

程序

导入库

import numpy as np
import pandas as pd  # pandas库
from imblearn.over_sampling import SMOTE  # 过抽样处理库SMOTE
from sklearn.ensemble import VotingClassifier, GradientBoostingClassifier, \
    RandomForestClassifier  # 四种集成分类库和投票方法库
from sklearn.model_selection import StratifiedKFold, cross_val_score  # 导入交叉检验算法
from sklearn.preprocessing import OrdinalEncoder  # 字符串转数值

函数模块

# Date/time feature expansion
def datetime_exp(data):
    """Expand 'order_date' and 'order_time' columns into numeric parts.

    Derives weekday, day-of-month and month from 'order_date', and second,
    minute and hour from 'order_time', then drops the two original string
    columns. The input DataFrame is modified in place and also returned.

    :param data: DataFrame with 'order_date' (%Y-%m-%d) and
                 'order_time' (%H:%M:%S) string columns.
    :return: the same DataFrame with the expanded attributes and without
             'order_date'/'order_time'.
    """
    # pd.datetime was deprecated in pandas 1.0 and removed in 2.0; the
    # vectorized pd.to_datetime + .dt accessor replaces the per-row
    # strptime list comprehensions (which also shadowed the name `data`).
    dates = pd.to_datetime(data['order_date'], format='%Y-%m-%d')
    data['weekday_data'] = dates.dt.weekday        # Monday=0 ... Sunday=6
    data['daysinmonth_data'] = dates.dt.day        # day of the month
    data['month_data'] = dates.dt.month            # month number

    times = pd.to_datetime(data['order_time'], format='%H:%M:%S')
    data['second_data'] = times.dt.second
    data['minute_data'] = times.dt.minute
    data['hour_data'] = times.dt.hour
    return data.drop(['order_date', 'order_time'], axis=1)

读取数据

# read_csv is the idiomatic reader for comma-delimited files
# (read_table defaults to tab-separated and needed delimiter=',' overridden).
raw_data = pd.read_csv('abnormal_orders.txt')

数据审查

# Basic overview
print('{:*^60}'.format('Data overview:'))
print(raw_data.tail(2))  # print the last 2 rows of the raw data
print('{:*^60}'.format('Data dtypes:'))
print(raw_data.dtypes)  # column data types
print('{:*^60}'.format('Data DESC:'))
print(raw_data.describe().round(2).T)  # basic descriptive statistics of the numeric columns
***********************Data overview:***********************
          order_id  order_date order_time       cat attribution      pro_id  \
134188  4285770012  2013-09-19   23:55:06      家居日用          GO  1000335947   
134189  4285770056  2013-05-20   23:58:59  生活电器厨卫电器          GO  1000009280   

       pro_brand  total_money  total_quantity order_source pay_type  \
134188       炊大师         79.0               1           抢购     合并支付   
134189        海尔        799.0               1           抢购     合并支付   

           user_id city  abnormal_label  
134188      shukun  东莞市               0  
134189  544975322_  海口市               0  
************************Data dtypes:************************
order_id            int64
order_date         object
order_time         object
cat                object
attribution        object
pro_id              int64
pro_brand          object
total_money       float64
total_quantity      int64
order_source       object
pay_type           object
user_id            object
city               object
abnormal_label      int64
dtype: object
*************************Data DESC:*************************
                   count          mean           std           min  \
order_id        134190.0  4.214285e+09  1.510533e+08  3.000316e+09   
pro_id          134190.0  3.404167e+09  3.287444e+09  1.000000e+09   
total_money     134189.0  6.601100e+02  2.901210e+03  5.000000e-01   
total_quantity  134190.0  1.200000e+00  3.230000e+00  1.000000e+00   
abnormal_label  134190.0  2.100000e-01  4.100000e-01  0.000000e+00   

                         25%           50%           75%           max  
order_id        4.203350e+09  4.276630e+09  4.281996e+09  4.285770e+09  
pro_id          1.000321e+09  1.000369e+09  8.001623e+09  8.002352e+09  
total_money     2.900000e+01  9.840000e+01  3.720000e+02  7.660000e+05  
total_quantity  1.000000e+00  1.000000e+00  1.000000e+00  1.000000e+03  
abnormal_label  0.000000e+00  0.000000e+00  0.000000e+00  1.000000e+00  
# Missing-value audit
na_cols = raw_data.isnull().any(axis=0)  # whether each column contains any NaN
print('{:*^60}'.format('NA Cols:'))
print(na_cols[na_cols==True])  # show only the columns that do have missing values
print('Total number of NA lines is: {0}'.format(
    raw_data.isnull().any(axis=1).sum()))  # number of rows containing at least one NaN
**************************NA Cols:**************************
cat            True
pro_brand      True
total_money    True
city           True
dtype: bool
Total number of NA lines is: 1429
# Class-balance audit
# NOTE(review): 'Labesl' in the banner is a typo in the original output
# string, kept as-is to preserve the runtime output byte-for-byte.
print('{:*^60}'.format('Labesl samples count:'))
print(raw_data.iloc[:, -1].value_counts())  # per-class counts of the last column (abnormal_label)
*******************Labesl samples count:********************
0    105733
1     28457
Name: abnormal_label, dtype: int64

数据预处理

# NaN handling: drop any row with a missing value
drop_na_set = raw_data.dropna()
# Drop the order-ID column (a pure identifier carries no predictive signal)
drop_na_set = drop_na_set.drop(['order_id'], axis=1)
# Encode string columns as ordinal integers
convert_cols = ['cat', 'attribution', 'pro_id', 'pro_brand', 'order_source', 'pay_type','user_id', 'city']  # columns to convert
enc = OrdinalEncoder()
drop_na_set[convert_cols] = enc.fit_transform(drop_na_set[convert_cols])
# Date/time feature expansion
data_final = datetime_exp(drop_na_set)
# Split train/test X and y (sequential 70/30 split)
num = int(0.7 * data_final.shape[0])
X_raw, y_raw = data_final.drop(['abnormal_label'], axis=1), data_final['abnormal_label']
X_train, X_test = X_raw.iloc[:num, :], X_raw.iloc[num:, :]
y_train, y_test = y_raw.iloc[:num], y_raw.iloc[num:]
# Class rebalancing: SMOTE over-sampling of the minority class
model_smote = SMOTE()  # SMOTE model object
# fit_sample() was removed in imbalanced-learn 0.8; fit_resample is the
# supported API (available since 0.4) and behaves identically.
x_smote_resampled, y_smote_resampled = model_smote.fit_resample(X_train, y_train)

模型训练

# Cross-validation
model_rf = RandomForestClassifier(max_features=0.8, random_state=0)  # random-forest classifier
model_gdbc = GradientBoostingClassifier(max_features=0.8, random_state=0)  # gradient-boosting classifier
estimators = [('randomforest', model_rf), ('gradientboosting', model_gdbc)]  # member list for the ensemble
model_vot = VotingClassifier(estimators=estimators, voting='soft', weights=[0.9, 1.2],
                             n_jobs=-1)  # soft-voting combination of the two models
# random_state only takes effect when shuffle=True; passing it with the
# default shuffle=False is silently ignored on old scikit-learn and raises
# a ValueError on scikit-learn >= 0.24, so make the intended shuffling explicit.
cv = StratifiedKFold(5, shuffle=True, random_state=2)
# NOTE(review): this scores model_gdbc alone, not the voting ensemble —
# presumably a quick sanity check; confirm whether model_vot should be
# evaluated here instead.
cv_score = cross_val_score(model_gdbc, x_smote_resampled, y_smote_resampled, cv=cv)
print('{:*^60}'.format('Cross val scores:'),'\n',cv_score)  # per-fold scores
print('Mean scores is: %.2f' % cv_score.mean())  # mean cross-validation score
*********************Cross val scores:********************** 
 [0.53665893 0.7974478  0.9446249  0.87760074 0.81158636]
Mean scores is: 0.79
# Train the combined model
model_vot.fit(x_smote_resampled, y_smote_resampled)  # fit the voting ensemble on the SMOTE-balanced training data
VotingClassifier(estimators=[('randomforest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.8, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
           ...    subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=-1, voting='soft',
         weights=[0.9, 1.2])

新数据集做预测

# Read the new data set
X_new = pd.read_csv('new_abnormal_orders.csv')
# Drop the order-ID column
X_new_drop = X_new.drop(['order_id'], axis=1)
# Encode string columns with the encoder fitted on the training data
# NOTE(review): OrdinalEncoder.transform raises on categories unseen during
# fit — confirm the new records only contain known category values.
X_new_drop[convert_cols] = enc.transform(X_new_drop[convert_cols])
# Date/time feature expansion
X_new_final = datetime_exp(X_new_drop)
# Predict labels and class probabilities
predict_label = model_vot.predict(X_new_final)
predict_proba = model_vot.predict_proba(X_new_final)
predict_np = np.hstack((predict_label.reshape(-1,1),predict_proba))
# 'lables' / 'Labesls' below are typos in the original runtime strings,
# kept as-is to preserve the printed output byte-for-byte.
predict_pd = pd.DataFrame(predict_np,columns=['lables','proba_0','proba_1'])
print('{:*^60}'.format('Predicted Labesls:'), '\n', predict_pd)
*********************Predicted Labesls:********************* 
    lables   proba_0   proba_1
0     1.0  0.430347  0.569653
1     0.0  0.706741  0.293259
2     0.0  0.987728  0.012272
3     0.0  0.991092  0.008908
4     0.0  0.993395  0.006605
5     0.0  0.758719  0.241281
6     0.0  0.759313  0.240687

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

灯下夜无眠

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值