CCF 乘用车细分市场销量预测 35个特征lgb单模0.6253
import sys
import numpy as np
import pandas as pd
import os
import gc
from tqdm import tqdm, tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder
import datetime
import time
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.cluster import KMeans
import xgboost as xgb
from sklearn.externals import joblib
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
train_sales = pd.read_csv('../input/train_sales_data.csv')
train_search = pd.read_csv('../input/train_search_data.csv')
train_user = pd.read_csv('../input/train_user_reply_data.csv')
evaluation_public = pd.read_csv('../input/evaluation_public.csv')
submit_example = pd.read_csv('../input/submit_example.csv')
data = pd.concat([train_sales, evaluation_public], ignore_index=True)
data = data.merge(train_search, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
data = data.merge(train_user, 'left', on=['model', 'regYear', 'regMonth'])
data['label'] = data['salesVolume']
data['id'] = data['id'].fillna(0).astype(int)
data['bodyType'] = data['model'].map(train_sales.drop_duplicates('model').set_index('model')['bodyType'])
for i in ['bodyType', 'model']:
data[i] = data[i].map(dict(zip(data[i].unique(), range(data[i].nunique()))))
data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']
def get_stat_feature(df_,):
df = df_.copy()
stat_feat = []
stat_feat_2=[]
stat_feat_3 = []
stat_feat_4 = []
df['model_adcode'] = df['adcode'] + df['model']
df['model_adcode_mt'] = df['model_adcode'] * 100 + df['mt']
for col in ['label']:
for i in [1,2,3,4,5,6,8,9,10,11,12,13,14,15,16]:
stat_feat.append('shift_model_adcode_mt_{}_{}'.format(col,i))
stat_feat_2.append('shift_model_adcode_mt_{}_{}'.format(col,i))
df['model_adcode_mt_{}_{}'.format(col,i)] = df['model_adcode_mt'] + i
df_last = df[~df[col].isnull()].set_index('model_adcode_mt_{}_{}'.format(col,i))
df['shift_model_adcode_mt_{}_{}'.format(col,i)] = df['model_adcode_mt'].map(df_last[col])
for col in ['popularity']:
for i in [1,2,3,10,11,12]:
stat_feat.append('shift_model_adcode_mt_{}_{}'.format(col,i))
stat_feat_2.append('shift_model_adcode_mt_{}_{}'.format(col,i))
df['model_adcode_mt_{}_{}'.format(col,i)] = df['model_adcode_mt'] + i
df_last = df[~df[col].isnull()].set_index('model_adcode_mt_{}_{}'.format(col,i))
df['shift_model_adcode_mt_{}_{}'.format(col,i)] = df['model_adcode_mt'].map(df_last[col])
df["increase16_4"]=(df["shift_model_adcode_mt_label_16"]-df["shift_model_adcode_mt_label_4"])/df["shift_model_adcode_mt_label_16"]
mean=pd.DataFrame(df.groupby(["model","mt"]).shift_model_adcode_mt_label_12.agg({"mean_province":"mean",
"min_province":"min",}))
df=pd.merge(df,mean,on=["model","mt"],how="left")
mean=pd.DataFrame(df.groupby(["model","mt"]).shift_model_adcode_mt_label_15.agg({"mean_province_15":"mean",}))
df=pd.merge(df,mean,on=["model","mt"],how="left")
mean=pd.DataFrame(df.groupby(["model","mt"]).shift_model_adcode_mt_label_3.agg({"mean_province_3":"mean",}))
df=pd.merge(df,mean,on=["model","mt"],how="left")
mean=pd.DataFrame(df.groupby(["model","mt"]).shift_model_adcode_mt_label_16.agg({"mean_province_16":"mean",}))
df=pd.merge(df,mean,on=["model","mt"],how="left")
mean=pd.DataFrame(df.groupby(["model","mt"]).shift_model_adcode_mt_label_4.agg({"mean_province_4":"mean",}))
df=pd.merge(df,mean,on=["model","mt"],how="left")
mean=pd.DataFrame(df.groupby(["adcode","mt"]).shift_model_adcode_mt_label_15.agg({"mean_Month_15":"mean"}))
df=pd.merge(df,mean,on=["adcode","mt"],how="left")
mean=pd.DataFrame(df.groupby(["adcode","mt"]).shift_model_adcode_mt_label_3.agg({"mean_Month_3":"mean"}))
df=pd.merge(df,mean,on=["adcode","mt"],how="left")
mean=pd.DataFrame(df.groupby(["adcode","mt"]).shift_model_adcode_mt_label_16.agg({"mean_Month_16":"mean"}))
df=pd.merge(df,mean,on=["adcode","mt"],how="left")
mean=pd.DataFrame(df.groupby(["adcode","mt"]).shift_model_adcode_mt_label_4.agg({"mean_Month_4":"mean"}))
df=pd.merge(df,mean,on=["adcode","mt"],how="left")
df["increase_mean_province_16_4"]=(df["mean_province_16"]-df["mean_province_4"])/df["mean_province_16"]
df["increase_mean_province_15_3"]=(df["mean_province_15"]-df["mean_province_3"])/df["mean_province_15"]
df["increase_mean_Month_15_3"]=(df["mean_Month_15"]-df["mean_Month_3"])/df["mean_Month_15"]
df["increase_mean_Month_16_4"]=(df["mean_Month_16"]-df["mean_Month_4"])/df["mean_Month_16"]
mean=pd.DataFrame(df.groupby(["adcode","mt"]).shift_model_adcode_mt_label_12.agg({"mean_Month":"mean",}))
df=pd.merge(df,mean,on=["adcode","mt"],how="left")
df["sum_1"]=df["shift_model_adcode_mt_label_11"].values+df["shift_model_adcode_mt_label_12"].values+df["shift_model_adcode_mt_label_1"].values+df["shift_model_adcode_mt_label_2"].values
df["sum_2"]=df["shift_model_adcode_mt_label_12"].values+df["shift_model_adcode_mt_label_1"].values
df["sum_3"]=df["shift_model_adcode_mt_label_3"].values+df["shift_model_adcode_mt_label_2"].values+df["shift_model_adcode_mt_label_1"].values
stat_feat_4 = ["mean_province","min_province","mean_Month","sum_1","sum_2","sum_3","increase16_4",
"increase_mean_province_15_3","increase_mean_Month_15_3","increase_mean_province_16_4","increase_mean_Month_16_4"]
stat_feat.remove("shift_model_adcode_mt_label_15")
stat_feat.remove("shift_model_adcode_mt_label_16")
return df,stat_feat+stat_feat_3+stat_feat_4
def score(data, pred='pred_label', label='label', group='model'):
data['pred_label'] = data['pred_label'].apply(lambda x: 0 if x < 0 else x).round().astype(int)
data_agg = data.groupby('model').agg({
pred: list,
label: [list, 'mean']
}).reset_index()
data_agg.columns = ['_'.join(col).strip() for col in data_agg.columns]
nrmse_score = []
for raw in data_agg[['{0}_list'.format(pred), '{0}_list'. format(label), '{0}_mean'.format(label)]].values:
nrmse_score.append(mse(raw[0], raw[1]) ** 0.5 / raw[2] )
print(1 - np.mean(nrmse_score))
return 1 - np.mean(nrmse_score)
def get_model_type(train_x,train_y,valid_x,valid_y,m_type='lgb',i=0):
if m_type == 'lgb':
model = lgb.LGBMRegressor(
num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='mse',
max_depth=-1, learning_rate=0.05, min_child_samples=10, random_state=2019,
n_estimators=2000, subsample=0.9, colsample_bytree=0.7,num_threads= -1,
)
model.fit(train_x, train_y,
eval_set=[(train_x, train_y),(valid_x, valid_y)],
categorical_feature=cate_feat,
early_stopping_rounds=100, verbose=100)
joblib.dump(model, "lgbm_"+str(i)+".m")
print("lgb_model_%d has saved"%i)
elif m_type == 'xgb':
model = xgb.XGBRegressor(
max_depth=5 , learning_rate=0.05, n_estimators=2000,
objective='reg:gamma', tree_method = 'hist',subsample=0.9,
colsample_bytree=0.7, min_child_samples=5,eval_metric = 'rmse'
)
model.fit(train_x, train_y,
eval_set=[(train_x, train_y),(valid_x, valid_y)],
early_stopping_rounds=100, verbose=100)
joblib.dump(model, "xgbm_"+str(i)+".m")
print("xgb_model_%d has saved"%i)
return model
def get_train_model(df_, m, m_type='lgb',i=0):
df = df_.copy()
st = 13
all_idx = (df['mt'].between(st , m-1))
train_idx = (df['mt'].between(st , m-5))
valid_idx = (df['mt'].between(m-4, m-4))
test_idx = (df['mt'].between(m , m ))
train_x = df[train_idx][features]
train_y = df[train_idx]['n_label']
valid_x = df[valid_idx][features]
valid_y = df[valid_idx]['n_label']
model = get_model_type(train_x,train_y,valid_x,valid_y,m_type,i)
df['pred_label'] = np.expm1(model.predict(df[features]))
best_score = score(df[valid_idx])
if m_type == 'lgb':
model.n_estimators = model.best_iteration_ + 100
model.fit(df[all_idx][features], df[all_idx]['n_label'], categorical_feature=cate_feat)
elif m_type == 'xgb':
model.n_estimators = model.best_iteration + 100
model.fit(df[all_idx][features], df[all_idx]['n_label'])
df['forecastVolum'] = np.expm1(model.predict(df[features]))
print('valid mean:',df[valid_idx]['pred_label'].mean())
print('true mean:',df[valid_idx]['label'].mean())
print('test mean:',df[test_idx]['forecastVolum'].mean())
sub = df[test_idx][['id']]
sub['forecastVolum'] = df[test_idx]['forecastVolum'].apply(lambda x: 0 if x < 0 else x).round().astype(int)
print(sub.shape)
return sub,df[valid_idx]['pred_label']
for month in [25,26,27,28]:
m_type = 'lgb'
data['n_label'] = np.log1p(data['label'])
data_df, stat_feat = get_stat_feature(data)
num_feat = ['regYear'] + stat_feat
cate_feat = ['adcode','bodyType','model','regMonth',]
if m_type == 'lgb':
for i in cate_feat:
data_df[i] = data_df[i].astype('category')
elif m_type == 'xgb':
lbl = LabelEncoder()
for i in tqdm(cate_feat):
data_df[i] = lbl.fit_transform(data_df[i].astype(str))
features = num_feat + cate_feat
print(len(features), len(set(features)))
sub,val_pred = get_train_model(data_df, month, m_type,month-24)
data.loc[(data.regMonth==(month-24))&(data.regYear==2018), 'salesVolume'] = sub['forecastVolum'].values
data.loc[(data.regMonth==(month-24))&(data.regYear==2018), 'label' ] = sub['forecastVolum'].values
sub = data.loc[(data.regMonth>=1)&(data.regYear==2018), ['id','salesVolume']]
sub.columns = ['id','forecastVolum']
sub[['id','forecastVolum']].round().astype(int).to_csv('../input/B_res.csv', index=False)
my_data=pd.read_csv('../input/B_res.csv')
my_data["forecastVolum"]=my_data["forecastVolum"]*0.79-5
my_data["forecastVolum"]=(my_data["forecastVolum"]).astype(int)
my_data.loc[my_data[my_data["forecastVolum"] < 4].index,"forecastVolum"]=4
my_data.loc[my_data[my_data["forecastVolum"] >9000].index,"forecastVolum"]=9000
my_data.to_csv('../input/submit.csv',index=0)