2019 CCF 乘用车销量预测

赛题通道:https://www.datafountain.cn/competitions/352

baseline单模0.57,祝各位好运!!!

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_squared_error as mse
from tqdm import tqdm, tqdm_notebook
import warnings
warnings.filterwarnings('ignore')

path = '/个人/比赛/乘用车销量预测'
train_sales_data = pd.read_csv(path+'/TrainData/train_sales_data.csv', engine='python')
train_search_data = pd.read_csv(path+'/TrainData/train_search_data.csv', engine='python')
train_user_reply_data = pd.read_csv(path+'/TrainData/train_user_reply_data.csv', engine='python')
test = pd.read_csv(path+'/TestPublic/evaluation_public.csv', engine='python')

# train_sales_data\train_search_data\train_user_reply_data  拼接
data = pd.merge(train_sales_data, train_search_data, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
data = pd.merge(data, train_user_reply_data, 'left', on=['model', 'regYear', 'regMonth'])

# col, col2, col3 中 ,设1.5倍四分位距之外的数据为异常值,用上下四分位数的均值填充
col, col2, col3 = ['popularity', 'carCommentVolum', 'newsReplyVolum']
col_per = np.percentile(data[col],(25,75))
diff = 1.5*(col_per[1] - col_per[0])
col_per_in = (data[col] >= col_per[0] - diff) & (data[col] <= col_per[1] + diff)

col_per2 = np.percentile(data[col2],(25,75))
diff2 = 1.5*(col_per2[1] - col_per2[0])
col_per_in2 = (data[col2] >= col_per2[0] - diff2) & (data[col2] <= col_per2[1] + diff2)

col_per3 = np.percentile(data[col3],(25,75))
diff3 = 1.5*(col_per3[1] - col_per3[0])
col_per_in3 = (data[col3] >= col_per3[0] - diff3) & (data[col3] <= col_per3[1] + diff3)

data.loc[~col_per_in, col] = col_per.mean()
data.loc[~col_per_in2, col2] = col_per2.mean()
data.loc[~col_per_in3, col3] = col_per3.mean()

# 统计销量
data['bt_ry_mean'] = data.groupby(['bodyType','regYear'])['salesVolume'].transform('mean')
data['ad_ry_mean'] = data.groupby(['adcode','regYear'])['salesVolume'].transform('mean')
data['md_ry_mean'] = data.groupby(['model','regYear'])['salesVolume'].transform('mean')


'''
一、lgb预测
'''
# 测试集并入
data = pd.concat([data, test], ignore_index=True)
data['label'] = data['salesVolume']
data['id'] = data['id'].fillna(0).astype(int)
del data['salesVolume'], data['forecastVolum']
# 填补测试集的车身类型
data['bodyType'] = data['model'].map(train_sales_data.drop_duplicates('model').set_index('model')['bodyType'])
# 编码 bodyType、model
for i in ['bodyType', 'model']:
    data[i] = data[i].map(dict(zip(data[i].unique(), range(data[i].nunique()))))
# 距离2016年的时间间隔,月数
data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']

shift_feat = []
data['model_adcode'] = data['adcode'] + data['model']
data['model_adcode_mt'] = data['model_adcode'] * 100 + data['mt']

# 填充测试集特征值
for col in ['carCommentVolum','newsReplyVolum','popularity','bt_ry_mean','ad_ry_mean', 'md_ry_mean']:
    lgb_col_na = pd.isnull(data[col])
    data[col] = data[col].replace(0,1)
    data.loc[lgb_col_na,col] = \
    ((((data.loc[(data['regYear'].isin([2017]))&(data['regMonth'].isin([1,2,3,4])), col].values /
    data.loc[(data['regYear'].isin([2016]))&(data['regMonth'].isin([1,2,3,4])), col].values)))*
    data.loc[(data['regYear'].isin([2017]))&(data['regMonth'].isin([1,2,3,4])), col].values * 1.03).round()


# 每年的新年在第几月份
data['happyNY'] = 0
data.loc[(data['regYear'].isin([2016,2018])&data['regMonth'].isin([2])),'happyNY'] = 1
data.loc[(data['regYear'].isin([2017])&data['regMonth'].isin([1])),'happyNY'] = 1


# label 下移12个月,则测试集填充上了label
for i in [4]:
    shift_feat.append('shift_model_adcode_mt_label_{0}'.format(i))
    data['model_adcode_mt_{0}'.format(i)] = data['model_adcode_mt'] + i
    data_last = data[~data.label.isnull()].set_index('model_adcode_mt_{0}'.format(i))
    data['shift_model_adcode_mt_label_{0}'.format(i)] = data['model_adcode_mt'].map(data_last['label'])

data.loc[pd.isnull(data['shift_model_adcode_mt_label_4']),'shift_model_adcode_mt_label_4'] = \
((data.loc[(data.regMonth.isin([1,2,3,4]))&(data.regYear.isin([2016])),'label'].values/
 data.loc[(data.regMonth.isin([1,2,3,4]))&(data.regYear.isin([2017])),'label'].values)*
data.loc[(data.regMonth.isin([1,2,3,4]))&(data.regYear.isin([2016])),'label'].values).round()

# 根据月份添加权重值
a = 6; b = 4
data['weightMonth'] = data['regMonth'].map({1:a, 2:a, 3:a, 4:a,
                                            5:b, 6:b, 7:b, 8:b, 9:b, 10:b, 11:b, 12:b,})



def score(data):
    pred = data.groupby(['adcode', 'model'])['pred_label'].agg(lambda x: list(x))
    label = data.groupby(['adcode', 'model'])['label'].agg(lambda x: list(x))
    label_mean = data.groupby(['adcode', 'model'])['label'].agg(lambda x: np.mean(x))
    data_agg = pd.DataFrame()
    data_agg['pred_label'] = pred
    data_agg['label'] = label
    data_agg['label_mean'] = label_mean
    nrmse_score = []
    for raw in data_agg.values:
        nrmse_score.append(mse(raw[0], raw[1]) ** 0.5 / raw[2])
    return 1 - np.mean(nrmse_score)


df_lgb = pd.DataFrame({'id': test['id']})
for col_add in ['ad_ry_mean', 'md_ry_mean', 'bt_ry_mean']:
    # 取用的字段,用于训练模型
    num_feat = shift_feat
    cate_feat = ['adcode', 'bodyType', 'model', 'regYear', 'regMonth', 'happyNY']
    features = num_feat + cate_feat + ['popularity', 'carCommentVolum', 'newsReplyVolum', 'weightMonth'] + [col_add]  # [ad_ry_mean, md_ry_mean, bt_ry_mean]

    train_idx = (data['mt'] <= 20) # 小于等于20月以内的数据作为训练集
    valid_idx = (data['mt'].between(21, 24)) # 21到24个月的数据作为验证集
    test_idx = (data['mt'] > 24) # 大于24个月的是测试集

    # label
    data['n_label'] = np.log(data['label'])

    train_x = data[train_idx][features]
    train_y = data[train_idx]['n_label']

    valid_x = data[valid_idx][features]
    valid_y = data[valid_idx]['n_label']

    ############################### lgb ###################################
    lgb_model = lgb.LGBMRegressor(
        num_leaves=40, reg_alpha=1, reg_lambda=0.1, objective='mse',
        max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=2019,
        n_estimators=8000, subsample=0.8, colsample_bytree=0.8)

    lgb_model.fit(train_x, train_y, eval_set=[(valid_x, valid_y)],
                  categorical_feature=cate_feat, early_stopping_rounds=100, verbose=300)
    data['pred_label'] = np.e ** lgb_model.predict(data[features])
    model = lgb_model
    # 特征重要程度
    print ('lgb特征重要程度:',sorted(dict(zip(train_x.columns,model.feature_importances_)).items(),key=lambda x: x[1], reverse=True))
    print('NRMSE的均值:',score(data = data[valid_idx]))
    model.n_estimators = model.best_iteration_
    model.fit(data[~test_idx][features], data[~test_idx]['n_label'], categorical_feature=cate_feat)
    data['forecastVolum'] = np.e ** model.predict(data[features])
    sub = data[test_idx][['id']]
    sub['forecastVolum'] = data[test_idx]['forecastVolum'].apply(lambda x: 0 if x < 0 else x).round().astype(int)
    sub_lgb = sub.reset_index(drop=True)
    sub_lgb = sub_lgb[['id','forecastVolum']]
    print('lgb中forecastVolmn的0值数量:',(sub_lgb['forecastVolum']==0).sum())
    df_lgb[col_add] = sub_lgb['forecastVolum']
df_lgb.to_csv(path + "/Result/df_lgb.csv", index=False) 
# df_lgb有三列值,任一一列提交,上0.57,祝各位好运!!

最后,如果思路和代码对你有帮助,请帮忙点击一下“有用”  ,谢谢啦!!!

点这里呦:https://discussion.datafountain.cn/questions/1901?newAnswer=true

欢迎关注我的公众号!

评论 10
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值