利用2019年CCF的数据集(乘用车销量预测)进行测试。
主要思路是把数据划分成很多窗口,这里以1320*4条数据为一个窗口。
因为每个省都有60个车型,一共22个省,所以每个月有60*22=1320条数据;要预测未来4个月,所以窗口长度取4。
import pandas as pd
# Load the sales table and the search table; the first row of each CSV
# holds the column names (header=0).
train_sales = pd.read_csv('C:\\Train\\train_sales_data.csv', header=0)
train_search = pd.read_csv('C:\\Train\\train_search_data.csv', header=0)

# Inner-join the two tables on the shared key columns.
data = pd.merge(
    train_sales,
    train_search,
    on=("adcode", "model", "regYear", "regMonth"),
    how='inner',
)

# The merge leaves two suffixed province columns; both are dropped.
data = data.drop(columns=['province_x', 'province_y'])
print(data)
import copy
# One-hot encode the categorical columns so that the frame can later be
# turned into a plain numeric array with `.values`.
categoricals = ['model', 'adcode', 'bodyType']
for feature in categoricals:
    # drop_first=True omits the first level of each feature.
    # dtype is pinned to uint8: newer pandas releases default to bool, and
    # mixing bool with integer columns makes `.values` an object array,
    # which cannot be passed to xgboost as a feature matrix.
    df = pd.get_dummies(data[feature], drop_first=True, dtype="uint8")
    data = pd.concat([data, df], axis=1)
    data = data.drop(columns=feature)
print(data.head())
# Rows [1320*20, 1320*24) are the rows the final prediction step is run on.
print(data.iloc[1320 * 20:1320 * 24, :].values)
def to_supervised(data, rows_per_month=1320, n_months=24, horizon=4, target_col=2):
    """Turn the frame into (features, target) arrays for multi-step forecasting.

    Each feature row is paired with the target value found ``horizon``
    months later, i.e. ``rows_per_month * horizon`` rows further down.

    NOTE(review): this assumes the rows are ordered chronologically in
    equal-sized monthly blocks of ``rows_per_month`` rows -- confirm against
    the input files.

    Args:
        data: DataFrame holding at least ``rows_per_month * n_months`` rows.
        rows_per_month: number of rows per monthly block (default 1320).
        n_months: number of monthly blocks used (default 24).
        horizon: how many months ahead the target lies (default 4).
        target_col: positional index of the target column (default 2).

    Returns:
        Tuple ``(x, y)``: ``x`` is the 2-D array of the first
        ``n_months - horizon`` monthly blocks (all columns), ``y`` is the 1-D
        array of ``target_col`` taken from the last ``n_months - horizon``
        monthly blocks.
    """
    n_rows = rows_per_month * (n_months - horizon)
    offset = rows_per_month * horizon
    x = data.iloc[0:n_rows, :].values
    y = data.iloc[offset:offset + n_rows, target_col].values
    return x, y
# Build the supervised arrays and report their shapes.
data_x, data_y = to_supervised(data)
print(data_x.shape)
print(data_y.shape)

# The first 1320*16 rows are used for training; rows 1320*16 .. 26399
# (i.e. up to 1320*20) are held out for evaluation.
train_x, train_y = data_x[:1320 * 16], data_y[:1320 * 16]
test_x, test_y = data_x[1320 * 16:1320 * 20], data_y[1320 * 16:1320 * 20]
print('-----------test_x------------')
print(test_x)
from numpy import nan
from numpy import isnan
from pandas import read_csv
from pandas import to_numeric
from sklearn.metrics import r2_score
import lightgbm as lgb
# multivariate multi-step encoder-decoder lstm
from math import sqrt
from numpy import split
from numpy import array
from pandas import read_csv
from sklearn.metrics import mean_squared_error
from matplotlib import pyplot
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from numpy.random import seed
import numpy as np
import xgboost as xgb
import pandas as pd
#from sklearn.metrics import roc_auc_score
from sklearn.metrics import explained_variance_score
import matplotlib.pyplot as plt
from hyperopt import STATUS_OK,STATUS_RUNNING, fmin, hp, tpe,space_eval, partial
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
print("---------DMatrix----------")
# Wrap the training and held-out arrays in xgboost DMatrix containers.
dtrain = xgb.DMatrix(data=train_x, label=train_y)
dvalid = xgb.DMatrix(data=test_x, label=test_y)

# Training parameters.
SEED = 314159265  # default random_state of optimize()
VALID_SIZE = 0.25  # not referenced anywhere in the visible code
def model_run(params):
    """Hyperopt objective: train one booster and score it on the held-out rows.

    Trains on the module-level ``dtrain`` for ``params['n_estimators']``
    boosting rounds, predicts on ``test_x`` and scores the predictions
    against ``test_y`` with ``get_score``.

    Args:
        params: dict of xgboost parameters sampled from the search space;
            must contain ``'n_estimators'``.

    Returns:
        dict with ``'loss'`` (the value of ``get_score``), ``'status'`` and
        ``'stats_running'`` entries.
    """
    print("starting...")
    print("Training with params: ")
    print(params)
    # The sampled value is a float; the round count must be an integer.
    num_boost_round = int(params['n_estimators'])
    print("watchlist")
    watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
    print("training...")
    gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist, verbose_eval=True)
    print("Validating...")
    check = gbm.predict(xgb.DMatrix(test_x), ntree_limit=gbm.best_iteration + 1)
    print("explained_variance_score...")
    # get_score is declared as get_score(pre, real): predictions go first so
    # that the error is normalised by the mean of the true values.
    score = get_score(check, test_y)
    print("pr...")
    print('Check error value: {:.6f}'.format(score))
    return {
        'loss': score,
        'status': STATUS_OK,
        'stats_running': STATUS_RUNNING
    }
def optimize(random_state=SEED):
    """Search the xgboost hyperparameter space and return the best setting.

    Runs ``fmin`` with the TPE algorithm over ``model_run`` for 2000
    evaluations, then resolves the result against the search space.

    Args:
        random_state: value stored under the ``'seed'`` key of the parameter
            space (it is not passed to ``fmin`` itself).

    Returns:
        dict of the best parameters as produced by ``space_eval``.
    """
    search_space = {
        # Sampled hyperparameters.
        'n_estimators': hp.quniform('n_estimators', 20, 60, 1),
        'eta': hp.quniform('eta', 0.02, 0.4, 0.02),
        'max_depth': hp.choice('max_depth', np.arange(1, 20, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
        # Fixed settings.
        'eval_metric': 'rmse',
        'objective': 'reg:linear',
        'nthread': 4,
        'booster': 'gbtree',
        'tree_method': 'exact',
        'silent': 1,
        'seed': random_state,
    }
    print("---------开始训练参数----------")
    best_point = fmin(fn=model_run, space=search_space, algo=tpe.suggest, max_evals=2000)
    # Resolve the result of fmin against the space to get the actual
    # parameter values.
    tuned_params = space_eval(search_space, best_point)
    print("BEST PARAMETERS: " + str(tuned_params))
    return tuned_params
# Scoring function.
def get_score(pre, real, n_models=60, n_regions=22, n_months=4):
    """Return the mean per-group normalised RMSE of the predictions.

    Both inputs are read as ``n_months`` consecutive blocks of
    ``n_models * n_regions`` values; inside each block, group ``i`` occupies
    positions ``[n_regions * i, n_regions * (i + 1))``. For every group the
    RMSE over all of its values (across all months) is divided by the mean of
    its true values, and the ratios are averaged over the groups.

    Each group is scored on its own values only; the lists collecting the
    slices used to grow across groups, which made every term a cumulative
    score over groups 0..i.

    NOTE(review): whether a run of ``n_regions`` rows really corresponds to
    one car model depends on the row order of the input data -- confirm.

    Args:
        pre: 1-D array-like of predictions; rounded to integers before scoring.
        real: 1-D array-like of true values, same layout as ``pre``.
        n_models: number of groups per monthly block (default 60).
        n_regions: number of rows per group within a block (default 22).
        n_months: number of monthly blocks (default 4).

    Returns:
        The average normalised RMSE (lower is better).

    Raises:
        ValueError: if either input has fewer than
            ``n_models * n_regions * n_months`` values.
    """
    pre = np.asarray(pre).round().astype(int)
    real = np.asarray(real)
    total = n_models * n_regions * n_months
    if pre.shape[0] < total or real.shape[0] < total:
        raise ValueError(
            "get_score needs at least {} values, got {} predictions and {} "
            "true values".format(total, pre.shape[0], real.shape[0])
        )
    # Axis 0: month, axis 1: group (model), axis 2: row within the group.
    layout = (n_months, n_models, n_regions)
    pre_blocks = pre[:total].reshape(layout).astype(float)
    real_blocks = real[:total].reshape(layout).astype(float)
    # Reduce over months and rows, keeping one value per group.
    rmse = np.sqrt(((pre_blocks - real_blocks) ** 2).mean(axis=(0, 2)))
    nrmse = rmse / real_blocks.mean(axis=(0, 2))
    return nrmse.sum() / n_models
print("---------开始优化参数----------")
best_params = optimize()
print("---------优化完成----------")
print(best_params)
# Train the final model with the tuned parameters.
print(best_params)
print("---------正式训练模型----------")
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
# NOTE(review): the tuned 'n_estimators' entry of best_params is not used as
# the round count here; training runs up to 180 rounds and stops early after
# 50 rounds without improvement on the last watchlist entry.
model_gbm = xgb.train(best_params, dtrain, 180, evals=watchlist, early_stopping_rounds=50, verbose_eval=True)
print("---------正式预测模型----------")
print("Predict test set...")
# Predict from the last 1320*4 rows of the feature frame.
test_prediction = model_gbm.predict(xgb.DMatrix(data.iloc[1320 * 20:1320 * 24, :].values), ntree_limit=model_gbm.best_iteration + 1)
print("---------预测完成----------")
print(test_prediction)
print("---------预测完成----------")
print(best_params)
print(test_prediction.shape)
test_prediction = test_prediction.round().astype(int)
# Persist one prediction per line; the context manager closes the file even
# if a write fails.
with open('C:\\car_xg.txt', 'w') as f:
    f.writelines(str(value) + '\n' for value in test_prediction[:1320 * 4])
print("持久化完成")
# Score the final model on the held-out rows.
test_prediction1 = model_gbm.predict(xgb.DMatrix(test_x), ntree_limit=model_gbm.best_iteration + 1)
test_prediction1 = test_prediction1.round().astype(int)
# get_score is declared as get_score(pre, real): predictions go first.
score = get_score(test_prediction1, test_y)
print(1 - score)
效果虽然不咋地,但是提供了一个比较简单的思路,排名400+。大佬们可以在这个基础上进行改进,也许可以提高很多。本人代码比较渣,仅供参考。