销量预测04(数据的初步处理:使用机器学习,在此考虑了时间窗作为输入)

导入计算库

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
plt.style.use("fivethirtyeight")
plt.rcParams["font.sans-serif"] = ["Microsoft YaHei"]
plt.rcParams["axes.unicode_minus"] = False
from sklearn.preprocessing import OneHotEncoder

import statsmodels.api as sm 
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from pandas.tseries.offsets import DateOffset


from statsmodels.tsa.arima.model import ARIMA

import warnings
warnings.filterwarnings("ignore")

导入数据

# Load the training and test CSVs and parse the operating-date column.
path_train = "../preocess_data/train_data_o.csv"
path_test = "../data/test_data.csv"
data  = pd.read_csv(path_train)
data_test = pd.read_csv(path_test)
# Convert date strings to datetime64 so the .dt accessors work later on.
data["运营日期"] = pd.to_datetime(data["运营日期"] )
data_test["运营日期"] = pd.to_datetime(data_test["日期"])
# Drop the row id and the raw date column; 运营日期 keeps the parsed copy.
data.drop(["行ID","日期"],axis=1,inplace=True) 
data_test.drop(["行ID","日期"],axis=1,inplace=True)

折扣编码

# One-hot encode the 折扣 (discount) flag; drop="if_binary" keeps a single
# 0/1 column for a binary feature instead of two redundant ones.
enc = OneHotEncoder(drop="if_binary")
enc.fit(data["折扣"].values.reshape(-1,1))
enc.transform(data["折扣"].values.reshape(-1,1)).toarray()
enc.transform(data_test["折扣"].values.reshape(-1,1)).toarray()
array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [0.],
       [0.]])
# Overwrite 折扣 with the encoded 0/1 column on both train and test.
data["折扣"] = enc.transform(data["折扣"].values.reshape(-1,1)).toarray()
data_test["折扣"]  = enc.transform(data_test["折扣"].values.reshape(-1,1)).toarray()

日期衍生

def time_derivation(t,col="运营日期"):
    """Derive calendar features from the datetime column *col*, in place.

    Adds year/month/day/quarter/weekofyear columns, dayofweek
    (Mon=1 .. Sun=7) and a weekend flag (1 for Sat/Sun), then returns
    the same (mutated) DataFrame.

    :param t: DataFrame holding a datetime64 column named *col*
    :param col: name of the datetime column to derive from
    :return: the same DataFrame with the derived columns appended
    """
    t["year"] = t[col].dt.year
    t["month"] = t[col].dt.month
    t["day"] = t[col].dt.day
    t["quarter"] = t[col].dt.quarter
    # .dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week is the supported ISO-week equivalent (cast the
    # UInt32 result back to a plain int to keep the previous dtype).
    t["weekofyear"] = t[col].dt.isocalendar().week.astype(int)
    t["dayofweek"] = t[col].dt.dayofweek+1
    t["weekend"] = (t["dayofweek"]>5).astype(int)
    return t

# Derive calendar features for both splits (mutates the frames in place).
data_train  = time_derivation(data)
data_test_  = time_derivation(data_test)

对每家店进行探索

对第一家尝试

# Training subset: keep only store ID 1.
data_train_1 = data_train[data_train["商店ID"] ==1]
# Test subset: the same store.
data_test_1 = data_test_[data_test_["商店ID"] ==1]
# Plot store 1's daily sales over time to eyeball trend/seasonality.
plt.figure(figsize=(16,8))
plt.plot(data_train_1["运营日期"],data_train_1["销量"])
plt.xlabel("日期",fontsize= 20)
plt.ylabel("销量",fontsize= 20)
plt.title("1号店的销量",fontsize=20)
Text(0.5, 1.0, '1号店的销量')


png

开始尝试

data_train_1.head()
商店ID商店类型位置地区节假日折扣销量运营日期yearmonthdayquarterweekofyeardayofweekweekend
01S1L3R111.07011.842018-01-012018111110
6071S1L3R101.042369.002018-01-022018121120
10461S1L3R101.050037.002018-01-032018131130
12071S1L3R101.044397.002018-01-042018141140
17521S1L3R101.047604.002018-01-052018151150

复制获得数据

# Work on copies so the per-store slices are not views into the full frame.
data_train__1 = data_train_1.copy()
data_test__1 = data_test_1.copy()

删除不变的属性

# These columns are constant within a single store, so they carry no signal
# for a per-store model; 运营日期 is dropped because its parts were derived.
data_train__1.drop(["商店类型","商店ID","位置","地区","运营日期"],axis=1,inplace=True)
data_test__1.drop(["商店类型","商店ID","位置","地区","运营日期"],axis=1,inplace=True)

在这里留下一个思考的点, 其实节假日的附近几天的属性也会对预测结果产生一定影响。

data_train__1.shape
(516, 10)
data_train__1
节假日折扣销量yearmonthdayquarterweekofyeardayofweekweekend
011.07011.842018111110
60701.042369.002018121120
104601.050037.002018131130
120701.044397.002018141140
175201.047604.002018151150
.................................
18656901.033075.00201952722210
18716501.037317.00201952822220
18739101.044652.00201952922230
18796201.042387.00201953022240
18811311.039843.78201953122250

516 rows × 10 columns

# Add a 10-step sales difference as an extra feature; the first 10 rows
# become NaN and are dropped.
data_train__1["diff_1"] = data_train__1["销量"].diff(10)
data_train__1 = data_train__1.dropna()

# data_test__1["diff_1"] = data_test__1["销量"].diff(1)
# data_test__1 = data_test__1.dropna()

将数据集变为有监督数据集

def series_to_supervisied_(data,step_in,step_out,dropnan = True):
    """Reframe a time-series DataFrame as a supervised-learning table.

    Each result row concatenates lagged copies of every column (inputs)
    with current/leading copies (outputs).

    :param data: observed series as a pandas DataFrame (one column per variable)
    :param step_in: number of lagged observations used as input (x): t-step_in .. t-1
    :param step_out: number of observations used as output (y): t .. t+step_out-1
    :param dropnan: drop rows containing NaN introduced by shifting (default True)
    :return: reframed DataFrame with columns named "<var>-(<k>)step" for lags
             and "<var>+(<k>)step" for the current/leading steps
    """
    df = data
    cols = []
    names = []
    # Input sequence [(t-n),(t-n+1),..,(t-1)]: positive shifts move past
    # values forward onto the current row.
    for i in range(step_in,0,-1):
        cols.append(df.shift(i))
        names+=[f"{name}-({i})step" for name in df.columns]
    # Output sequence [t,(t+1),..]: negative shifts pull future values back.
    # (The f-string with i == 0 already yields "+(0)step", so no special
    # case is needed for the first output step.)
    for i in range(0,step_out):
        cols.append(df.shift(-i))
        names+=[f"{name}+({i})step" for name in df.columns]

    df_re = pd.concat(cols,axis=1)
    df_re.columns = names
    if dropnan:
        # Shifting leaves NaN at both ends; drop those incomplete rows.
        df_re.dropna(inplace=True)

    return df_re
# Build the supervised table: 10 lag steps as input, 2 output steps (t, t+1).
data_step = series_to_supervisied_(data_train__1,step_in= 10,step_out=2,dropnan = True)
data_step 
节假日-(10)step折扣-(10)step销量-(10)stepyear-(10)stepmonth-(10)stepday-(10)stepquarter-(10)stepweekofyear-(10)stepdayofweek-(10)stepweekend-(10)step...折扣+(1)step销量+(1)stepyear+(1)stepmonth+(1)stepday+(1)stepquarter+(1)stepweekofyear+(1)stepdayofweek+(1)stepweekend+(1)stepdiff_1+(1)step
76580.00.036873.002018.01.011.01.02.04.00.0...0.044178.752018.01.022.01.04.01.00.018534.75
77910.00.025644.002018.01.012.01.02.05.00.0...0.028086.002018.01.023.01.04.02.00.0-12804.00
83370.01.040890.002018.01.013.01.02.06.01.0...0.047835.002018.01.024.01.04.03.00.08580.60
86101.01.039254.402018.01.014.01.02.07.01.0...1.045384.002018.01.025.01.04.04.00.0-6609.00
88210.01.051993.002018.01.015.01.03.01.00.0...1.025868.882018.01.026.01.04.05.00.0-11847.12
..................................................................
1861520.01.047619.002019.05.016.02.020.04.00.0...1.033075.002019.05.027.02.022.01.00.0-23178.00
1865690.01.056253.002019.05.017.02.020.05.00.0...1.037317.002019.05.028.02.022.02.00.0-15560.82
1871651.01.052877.822019.05.018.02.020.06.01.0...1.044652.002019.05.029.02.022.03.00.0-24081.00
1873910.01.068733.002019.05.019.02.020.07.01.0...1.042387.002019.05.030.02.022.04.00.018891.00
1879620.00.023496.002019.05.020.02.021.01.00.0...1.039843.782019.05.031.02.022.05.00.03231.78

495 rows × 132 columns

# Inspect the 11 trailing "+(1)step" columns (they belong to the t+1 step).
list(data_step.columns)[-11:]
['节假日+(1)step',
 '折扣+(1)step',
 '销量+(1)step',
 'year+(1)step',
 'month+(1)step',
 'day+(1)step',
 'quarter+(1)step',
 'weekofyear+(1)step',
 'dayofweek+(1)step',
 'weekend+(1)step',
 'diff_1+(1)step']
# Target is next-day sales; inputs are everything except the 11 "+(1)step"
# columns (the other t+1 columns would leak the future).
y_columns = "销量+(1)step"
y = data_step[y_columns]
x = data_step[list(data_step.columns)[:-11]]
data_ = pd.concat([x,y],axis=1)
data_
节假日-(10)step折扣-(10)step销量-(10)stepyear-(10)stepmonth-(10)stepday-(10)stepquarter-(10)stepweekofyear-(10)stepdayofweek-(10)stepweekend-(10)step...销量+(0)stepyear+(0)stepmonth+(0)stepday+(0)stepquarter+(0)stepweekofyear+(0)stepdayofweek+(0)stepweekend+(0)stepdiff_1+(0)step销量+(1)step
76580.00.036873.002018.01.011.01.02.04.00.0...44625.00201812113717752.0044178.75
77910.00.025644.002018.01.012.01.02.05.00.0...44178.752018122141018534.7528086.00
83370.01.040890.002018.01.013.01.02.06.01.0...28086.0020181231420-12804.0047835.00
86101.01.039254.402018.01.014.01.02.07.01.0...47835.00201812414308580.6045384.00
88210.01.051993.002018.01.015.01.03.01.00.0...45384.0020181251440-6609.0025868.88
..................................................................
1861520.01.047619.002019.05.016.02.020.04.00.0...25035.00201952622171-22584.0033075.00
1865690.01.056253.002019.05.017.02.020.05.00.0...33075.00201952722210-23178.0037317.00
1871651.01.052877.822019.05.018.02.020.06.01.0...37317.00201952822220-15560.8244652.00
1873910.01.068733.002019.05.019.02.020.07.01.0...44652.00201952922230-24081.0042387.00
1879620.00.023496.002019.05.020.02.021.01.00.0...42387.0020195302224018891.0039843.78

495 rows × 122 columns

划分训练集测试集

# Chronological 80/20 split (shuffle=False keeps the time order so the test
# set is the tail of the series); only the training rows are shuffled.
lens = -1
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
data_train_,data_test_ = train_test_split(data_,test_size=0.2,shuffle=False)
data_train_  = shuffle(data_train_,random_state=1412)
# Last column (lens == -1) is the target; everything before it is features.
xtrain,ytrain = data_train_.iloc[:,:lens],data_train_.iloc[:,lens]
xtest,ytest = data_test_.iloc[:,:lens],data_test_.iloc[:,lens]

归一化

# Min-max scale the target, fitting on the training target only.
# NOTE(review): y_train / y_test are never used below — the models are fit
# on the unscaled ytrain/ytest; confirm whether scaling was intended here.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(ytrain.values.reshape(-1,1))
y_train = scaler.transform(ytrain.values.reshape(-1,1))
y_test = scaler.transform(ytest.values.reshape(-1,1))

随机森林

# Random-forest baseline on the raw (unscaled) target.
from sklearn.ensemble import RandomForestRegressor
rf_clf = RandomForestRegressor(max_depth=12, 
                               min_impurity_decrease=0.0, 
                               n_estimators= 300)
rf_clf.fit(xtrain,ytrain)
# R^2 on the held-out chronological tail.
rf_clf.score(xtest,ytest)
0.43699715354836455
from sklearn.metrics import mean_squared_error
# Test vs train RMSE — the large gap suggests overfitting.
mean_squared_error(ytest,rf_clf.predict(xtest))**0.5
10330.02670862774
mean_squared_error(ytrain,rf_clf.predict(xtrain))**0.5
3440.408502696517
def symmetric_mean_absolute_percentage_error(y_true, y_pred):
    """Aggregate SMAPE: 2 * sum|y - yhat| / sum(|y| + |yhat|), in [0, 2]."""
    truth = np.array(y_true)
    pred = np.array(y_pred)
    numerator = np.sum(np.abs(truth - pred) * 2)
    denominator = np.sum(np.abs(truth) + np.abs(pred))
    return numerator / denominator

def prophet_smape(y_true, y_pred):
    """Return SMAPE as the (name, value, is_higher_better) triple expected
    by Prophet/LightGBM-style custom-metric hooks."""
    score = symmetric_mean_absolute_percentage_error(y_true, y_pred)
    return 'SMAPE', score, False
 prophet_smape(ytrain,rf_clf.predict(xtrain))
('SMAPE', 0.07417421103653893, False)
 prophet_smape(ytest,rf_clf.predict(xtest))

xgboost

# XGBoost baseline with the same features; note the near-zero train RMSE
# against the large test RMSE — the model memorizes the training set.
from xgboost import XGBRegressor
xgb_clf = XGBRegressor(max_depth=12, 
                               n_estimators=500)
xgb_clf.fit(xtrain,ytrain)
xgb_clf.score(xtest,ytest)
0.40055294233563
mean_squared_error(ytest,xgb_clf.predict(xtest))**0.5
10659.125252942618
mean_squared_error(ytrain,xgb_clf.predict(xtrain))**0.5
0.0036928046625203763
 prophet_smape(ytest,xgb_clf.predict(xtest))
('SMAPE', 0.22077657322671448, False)
 prophet_smape(ytrain,xgb_clf.predict(xtrain))
('SMAPE', 7.480844639369475e-08, False)

用一下贝叶斯优化看一下,是不是历史时间步长的问题?

from hyperopt import hp,fmin,tpe,Trials,partial
from hyperopt.early_stop import no_progress_loss


# 定义目标函数:
def hpyeropt_objective(params):
    """Hyperopt objective: rebuild the supervised table with the candidate
    time_step, fit a RandomForest on the notebook-global data_train__1, and
    return the negated test R^2 (hyperopt minimizes the returned loss).

    NOTE(review): x drops the last 20 columns while each output step adds
    11 columns (diff_1 included), and xtrain slices `:-20` again on a frame
    that already excludes them — verify these offsets match the intended
    column layout before trusting the search results.
    """

    data_step = series_to_supervisied_(data_train__1,step_in = int(params["time_step"]),step_out=2,dropnan = True)
    
    y_columns = "销量+(0)step"
    y = data_step[y_columns]
    x = data_step[list(data_step.columns)[:-20]]
    data_ = pd.concat([x,y],axis=1)
    
    from sklearn.model_selection import train_test_split
    from sklearn.utils import shuffle
    # Chronological split, then shuffle only the training partition.
    data_train,data_test = train_test_split(data_,test_size=0.2,shuffle=False)
    data_train  = shuffle(data_train,random_state=1412)
    xtrain,ytrain = data_train.iloc[:,:-20],data_train.iloc[:,-1]
    xtest,ytest = data_test.iloc[:,:-20],data_test.iloc[:,-1]
    
    # Min-max scale the target; the scaler is fitted on the training target
    # and applied to both partitions, so R^2 is computed on scaled values.
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    scaler.fit(ytrain.values.reshape(-1,1))
    ytrain = scaler.transform(ytrain.values.reshape(-1,1))
    ytest = scaler.transform(ytest.values.reshape(-1,1))

    
    clf = RandomForestRegressor(n_estimators=int(params["n_estimators"]),
                                max_depth=int(params["max_depth"]),
#                                max_features=int(min(params["max_features"],len(xtrain.columns))),
                               min_impurity_decrease=params["min_impurity_decrease"],
                               random_state=1412,
                               verbose=False,
                               n_jobs=-1
                               ).fit(xtrain,ytrain)
    
    scores = clf.score(xtest,ytest)
    # Hyperopt minimizes, so negate the R^2 score.
    return -scores   


# Search space: forest size/depth/impurity plus the lag-window length
# (time_step), so the history length is tuned together with the model.
params_grid = {"n_estimators":hp.quniform("n_estimators",10,1000,20),
                   "max_depth":hp.quniform("max_depth",5,25,1),
#                    "max_features":hp.quniform("max_features",10,10000,1),
                   "min_impurity_decrease":hp.quniform("min_impurity_decrease",0,5,1),
              
           
               "time_step":hp.quniform("time_step",10,200,5)
              }

#定义迭代
def param_hyperopt(max_evals = 100):
    """Run TPE search over params_grid, stopping early after 50 evaluations
    without improvement; return (best params dict, Trials record)."""
    trial_log = Trials()
    stopper = no_progress_loss(50)
    best = fmin(
        hpyeropt_objective,
        space=params_grid,
        algo=tpe.suggest,
        max_evals=max_evals,
        verbose=True,
        trials=trial_log,
        early_stop_fn=stopper,
    )
    print("\n", "\n", "best params:", best, "\n")
    return best, trial_log
# Launch the search with a 300-evaluation budget (early stop may end sooner).
params_best,trials =  param_hyperopt(max_evals = 300)
 34%|███████████████                             | 103/300 [02:59<05:43,  1.75s/trial, best loss: -0.48958559215129405]


best params: {'max_depth': 9.0, 'min_impurity_decrease': 0.0, 'n_estimators': 660.0, 'time_step': 10.0}

目前来讲的话,效果都不是很好,那么接下来从时间角度来想的话,在每个时间窗内进行差分以及其他的特征衍生,尽量的提取一些其他的特征! 至此先考虑下另外一个方案。

# NOTE(review): this subtracts a series from itself and should yield all
# zeros/NaN, yet the pasted output below shows a full-frame one-step diff —
# the original expression was likely something like data_train__1.diff(-1);
# confirm against the source notebook before reusing.
data_train__1["销量"].diff()-data_train__1["销量"].diff()
节假日折扣销量yearmonthdayquarterweekofyeardayofweekweekend
01.00.0-35357.160.00.0-1.00.00.0-1.00.0
6070.00.0-7668.000.00.0-1.00.00.0-1.00.0
10460.00.05640.000.00.0-1.00.00.0-1.00.0
12070.00.0-3207.000.00.0-1.00.00.0-1.00.0
17520.01.023109.000.00.0-1.00.00.0-1.0-1.0
.................................
1865690.00.0-4242.000.00.0-1.00.00.0-1.00.0
1871650.00.0-7335.000.00.0-1.00.00.0-1.00.0
1873910.00.02265.000.00.0-1.00.00.0-1.00.0
187962-1.00.02543.220.00.0-1.00.00.0-1.00.0
188113NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN

516 rows × 10 columns

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值