利用xgboost进行多元时间序列预测

最新推荐文章于 2024-08-15 00:20:42 发布

喝粥也会胖的唐僧

最新推荐文章于 2024-08-15 00:20:42 发布

阅读量9.8k

点赞数 8

分类专栏：机器学习

本文链接：https://blog.csdn.net/zhou_438/article/details/101350873

版权

机器学习专栏收录该内容

32 篇文章 7 订阅

订阅专栏

利用2019年ccf的数据集进行测试（乘用车销量预测）

主要思路是把数据划分成多个窗口，这里以 1320*4 条数据为一个窗口。

因为每个省都有60个车型，一共22个省，所以每个月有 60*22=1320 条数据；要预测未来4个月，所以窗口大小取 1320*4。

import pandas as pd
import copy

# Load the sales and search tables and join them on the shared keys.
# `on` is passed as a list, the form documented for DataFrame.merge.
train_sales = pd.read_csv('C:\\Train\\train_sales_data.csv', header=0)
train_search = pd.read_csv('C:\\Train\\train_search_data.csv', header=0)
data = train_sales.merge(
    train_search,
    on=["adcode", "model", "regYear", "regMonth"],
    how='inner',
)
# Both inputs carry a `province` column, so the merge emits suffixed
# duplicates; neither is used as a feature.
data = data.drop(['province_x', 'province_y'], axis=1)
print(data)

# One-hot encode each categorical column and replace the original column
# with its indicator columns (first level dropped).
categoricals = ['model', 'adcode', 'bodyType']
for feature in categoricals:
    # dtype is pinned to uint8 (the historical pandas default) so that the
    # later `.values` calls yield a numeric array on every pandas version;
    # newer releases default to bool indicators.
    dummies = pd.get_dummies(data[feature], drop_first=True, dtype="uint8")
    data = pd.concat([data.drop(columns=feature), dummies], axis=1)
print(data.head())

# Preview rows 1320*20 .. 1320*24, the slice later used as the
# prediction input.
print(data.iloc[1320*20:1320*24, :].values)
def to_supervised(data, rows_per_month=1320, n_months=24, horizon=4, target_col=2):
    """Build (features, target) arrays for a fixed-horizon forecast.

    Row ``k`` of the feature array is paired with the value of
    ``target_col`` found ``rows_per_month * horizon`` rows later, i.e. the
    same position ``horizon`` blocks ahead.

    Args:
        data: DataFrame whose rows are laid out in consecutive blocks of
            ``rows_per_month`` rows (assumed to be one block per month --
            the caller must ensure the ordering).
        rows_per_month: Number of rows in one block.
        n_months: Number of blocks of ``data`` to use.
        horizon: How many blocks ahead the target lies.
        target_col: Positional index of the target column.

    Returns:
        Tuple ``(x, y)`` of numpy arrays: ``x`` holds all columns of the
        first ``n_months - horizon`` blocks, ``y`` holds ``target_col`` of
        the last ``n_months - horizon`` blocks.

    Raises:
        ValueError: If ``horizon`` is not strictly between 0 and
            ``n_months``.
    """
    if not 0 < horizon < n_months:
        raise ValueError("horizon must satisfy 0 < horizon < n_months")
    shift = rows_per_month * horizon
    total = rows_per_month * n_months
    x = data.iloc[0:total - shift, :].values
    y = data.iloc[shift:total, target_col].values
    return x, y

# Build the supervised arrays, then split them by position: the first
# 1320*16 rows are used for training and the remaining rows (up to
# 1320*20) for evaluation.
data_x, data_y = to_supervised(data)
print(data_x.shape)
print(data_y.shape)

split_at = 1320 * 16
split_end = 1320 * 20
train_x = data_x[:split_at]
test_x = data_x[split_at:split_end]
train_y = data_y[:split_at]
test_y = data_y[split_at:split_end]

print('-----------test_x------------')
print(test_x)
from numpy import nan
from numpy import isnan
from pandas import read_csv
from pandas import to_numeric

from sklearn.metrics import r2_score 
import lightgbm as lgb
# multivariate multi-step encoder-decoder lstm
from math import sqrt
from numpy import split
from numpy import array
from pandas import read_csv
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot

from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from numpy.random import seed 
import numpy as np
import xgboost as xgb
import pandas as pd
#from sklearn.metrics import roc_auc_score
from sklearn.metrics import explained_variance_score
import matplotlib.pyplot as plt

from hyperopt import STATUS_OK,STATUS_RUNNING, fmin, hp, tpe,space_eval, partial

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score


# Training settings: seed forwarded to XGBoost by optimize(), and a
# validation fraction (not referenced elsewhere in the visible code).
SEED = 314159265
VALID_SIZE = 0.25

print("---------DMatrix----------")
# Wrap the training and evaluation splits in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data=train_x, label=train_y)
dvalid = xgb.DMatrix(data=test_x, label=test_y)
def model_run(params):
    """Train one XGBoost model for a hyperopt trial and report its loss.

    Args:
        params: Parameter dict sampled from the search space. It is passed
            to ``xgb.train`` as-is; ``params['n_estimators']`` is also used
            as the number of boosting rounds.

    Returns:
        Dict for hyperopt with the evaluation score under ``'loss'`` and
        ``STATUS_OK`` under ``'status'``.
    """
    print("starting...")
    print("Training with params: ")
    print(params)
    # The sampled value is cast because the round count must be an integer.
    num_boost_round = int(params['n_estimators'])
    print("watchlist")
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    print("training...")
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, verbose_eval=True)
    print("Validating...")
    # NOTE(review): `ntree_limit` is deprecated in newer xgboost releases in
    # favour of `iteration_range` -- confirm against the installed version.
    check = gbm.predict(xgb.DMatrix(test_x), ntree_limit=gbm.best_iteration + 1)
    print("explained_variance_score...")
    # get_score is declared as get_score(pre, real): predictions go first so
    # that the predictions are rounded and the error is normalised by the
    # mean of the real values, not the other way round.
    score = get_score(check, test_y)
    print("pr...")
    print('Check error value: {:.6f}'.format(score))
    return {
        'loss': score,
        'status': STATUS_OK,
        'stats_running': STATUS_RUNNING
    }

def optimize(random_state=SEED, max_evals=2000):
    """Search the XGBoost hyperparameter space with hyperopt's TPE.

    Given the search space defined below and ``model_run`` as the scoring
    function, find the hyperparameters with the lowest loss.

    Args:
        random_state: Value stored under the ``'seed'`` key of the
            parameter dict handed to XGBoost.
        max_evals: Number of trials ``fmin`` is allowed to run.

    Returns:
        The best parameter dict, resolved to concrete values with
        ``space_eval``.
    """
    space = {
        # Tuned dimensions.
        'n_estimators': hp.quniform('n_estimators', 20, 60, 1),
        'eta': hp.quniform('eta', 0.02, 0.4, 0.02),
        'max_depth':  hp.choice('max_depth', np.arange(1, 20, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        # Fixed settings.
        'eval_metric': 'rmse',
        'objective': 'reg:linear',
        'nthread': 4,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'silent': 1,
        'seed': random_state
    }

    print("---------开始训练参数----------")
    best = fmin(model_run, space, algo=tpe.suggest, max_evals=max_evals)
    # fmin returns raw sampled labels (e.g. an index for hp.choice);
    # space_eval maps them back onto the actual parameter values.
    best_params = space_eval(space, best)
    print("BEST PARAMETERS: " + str(best_params))
    return best_params
## Scoring function
def get_score(pre, real, n_models=60, n_months=4, n_provinces=22):
    """Return the mean per-model normalised RMSE of the predictions.

    Both inputs are flat arrays laid out as ``n_months`` consecutive
    blocks of ``n_models * n_provinces`` values; inside a block, model
    ``i`` occupies positions ``n_provinces * i`` up to
    ``n_provinces * (i + 1)``. For each model the RMSE over all of its
    values is divided by the mean of its real values, and the results are
    averaged over the models. Each model is scored on its own values only.

    Args:
        pre: Predicted values; rounded to the nearest integer before
            scoring.
        real: Ground-truth values, same layout as ``pre``.
        n_models: Number of models per block.
        n_months: Number of blocks.
        n_provinces: Number of consecutive values per model in a block.

    Returns:
        The averaged normalised RMSE as a numpy float. A model whose real
        values average to zero yields inf/nan for its term.

    Raises:
        ValueError: If either input holds fewer than
            ``n_models * n_months * n_provinces`` values.
    """
    total = n_models * n_months * n_provinces
    pre = np.asarray(pre).round().astype(int)
    real = np.asarray(real)
    if pre.shape[0] < total or real.shape[0] < total:
        raise ValueError(
            "pre and real must each contain at least %d values" % total
        )

    # Axis 0 = month block, axis 1 = model, axis 2 = position within model.
    pre_cube = pre[:total].reshape(n_months, n_models, n_provinces)
    real_cube = real[:total].reshape(n_months, n_models, n_provinces)

    # Reduce over months and positions so every model gets its own score.
    diff = pre_cube.astype(float) - real_cube
    rmse = np.sqrt((diff ** 2).mean(axis=(0, 2)))
    nrmse = rmse / real_cube.mean(axis=(0, 2))
    return nrmse.sum() / n_models
# Run the hyperparameter search and show the winning configuration.
print("---------开始优化参数----------")
best_params = optimize()
print("---------优化完成----------")
print(best_params)

# Train the final model with the selected parameters.
print(best_params)
print("---------正式训练模型----------")
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
model_gbm = xgb.train(
    best_params,
    dtrain,
    180,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=True,
)

# Predict from rows 1320*20 .. 1320*24 of the prepared data.
print("---------正式预测模型----------")
print("Predict test set...")
future_rows = data.iloc[1320*20:1320*24, :].values
used_trees = model_gbm.best_iteration + 1
test_prediction = model_gbm.predict(xgb.DMatrix(future_rows), ntree_limit=used_trees)
print("---------预测完成----------")
print(test_prediction)
print("---------预测完成----------")
print(best_params)
print(test_prediction.shape)
# Round the predictions to whole numbers and write the first 1320*4 of
# them to disk, one value per line.
test_prediction = test_prediction.round().astype(int)
# The context manager closes the file even if a write fails.
with open('C:\\car_xg.txt', 'w') as f:
    f.writelines(str(value) + '\n' for value in test_prediction[:1320 * 4])
print("持久化完成")
# Score the final model on the held-out split.
test_prediction1 = model_gbm.predict(xgb.DMatrix(test_x), ntree_limit=model_gbm.best_iteration + 1)
test_prediction1 = test_prediction1.round().astype(int)
# get_score is declared as get_score(pre, real): predictions first, real
# values second, so the error is normalised by the mean of the real values.
score = get_score(test_prediction1, test_y)
print(1 - score)

效果虽然不咋地，但是提供了一个比较简单的思路，排名400+，大佬们可以在这个的基础上进行改进，也许可以提高很多，本人渣渣代码