特征优化结束后,使用分类预测客户是否是异常客户并对两类客户分别进行回归预测
为了方便更快速的调用三种不同的模型,并且同时要求能够完成分类和回归预测,此处通过定义一个函数来完成所有模型的训练过程。
def train_model(X, X_test, y, params, folds, model_type='lgb', eval_type='regression'):
    """Cross-validated training helper for LightGBM / XGBoost / CatBoost.

    Trains one model per fold of `folds`, collecting out-of-fold (OOF)
    predictions for the training set and fold-averaged predictions for the
    test set.  The last two parameters select the library and the task type
    so the same function serves both the classification and the two
    regression sub-models.

    Parameters
    ----------
    X, y : indexable arrays — training features and target.
    X_test : array — test features, scored by every fold's model.
    params : dict — hyper-parameters forwarded to the chosen library.
    folds : cross-validator exposing .split() and .n_splits (e.g. KFold).
    model_type : {'lgb', 'xgb', 'cat'} — which library to train.
    eval_type : {'regression', 'binary'} — task type; controls the per-fold
        metric (RMSE vs log-loss) and, for CatBoost, the estimator class.

    Returns
    -------
    oof : np.ndarray — out-of-fold predictions aligned with X.
    predictions : np.ndarray — test predictions averaged over all folds.
    scores : list[float] — per-fold validation scores.
    """
    oof = np.zeros(X.shape[0])
    predictions = np.zeros(X_test.shape[0])
    scores = []
    for fold_n, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print('Fold', fold_n, 'started at', time.ctime())
        if model_type == 'lgb':
            trn_data = lgb.Dataset(X[trn_idx], y[trn_idx])
            val_data = lgb.Dataset(X[val_idx], y[val_idx])
            clf = lgb.train(params, trn_data, num_boost_round=20000,
                            valid_sets=[trn_data, val_data],
                            verbose_eval=100, early_stopping_rounds=300)
            oof[val_idx] = clf.predict(X[val_idx], num_iteration=clf.best_iteration)
            # Each fold contributes 1/n_splits of the final test prediction.
            predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
        if model_type == 'xgb':
            trn_data = xgb.DMatrix(X[trn_idx], y[trn_idx])
            val_data = xgb.DMatrix(X[val_idx], y[val_idx])
            watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
            clf = xgb.train(dtrain=trn_data, num_boost_round=20000,
                            evals=watchlist, early_stopping_rounds=200,
                            verbose_eval=100, params=params)
            oof[val_idx] = clf.predict(xgb.DMatrix(X[val_idx]), ntree_limit=clf.best_ntree_limit)
            predictions += clf.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit) / folds.n_splits
        if (model_type == 'cat') and (eval_type == 'regression'):
            clf = CatBoostRegressor(iterations=20000, eval_metric='RMSE', **params)
            clf.fit(X[trn_idx], y[trn_idx],
                    eval_set=(X[val_idx], y[val_idx]),
                    cat_features=[], use_best_model=True, verbose=100)
            oof[val_idx] = clf.predict(X[val_idx])
            predictions += clf.predict(X_test) / folds.n_splits
        if (model_type == 'cat') and (eval_type == 'binary'):
            clf = CatBoostClassifier(iterations=20000, eval_metric='Logloss', **params)
            clf.fit(X[trn_idx], y[trn_idx],
                    eval_set=(X[val_idx], y[val_idx]),
                    cat_features=[], use_best_model=True, verbose=100)
            # For the classifier, keep the positive-class probability so the
            # downstream blend can weight by outlier likelihood.
            oof[val_idx] = clf.predict_proba(X[val_idx])[:, 1]
            predictions += clf.predict_proba(X_test)[:, 1] / folds.n_splits
        print(predictions)
        if eval_type == 'regression':
            # RMSE = sqrt(MSE) on this fold's validation slice.
            scores.append(mean_squared_error(y[val_idx], oof[val_idx]) ** 0.5)
        if eval_type == 'binary':
            scores.append(log_loss(y[val_idx], oof[val_idx]))
    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))
    return oof, predictions, scores
下面用上面的模型构建三个子模型
# Base LightGBM hyper-parameters for the regression models; `objective`
# and `metric` are overwritten further down for the binary classifier.
lgb_params = {'num_leaves': 63,
              'min_data_in_leaf': 32,
              'objective': 'regression',
              'max_depth': -1,
              'learning_rate': 0.01,
              'min_child_samples': 20,
              'boosting': 'gbdt',
              'feature_fraction': 0.9,
              'bagging_freq': 1,
              'bagging_fraction': 0.9,
              'bagging_seed': 11,
              'metric': 'rmse',
              'lambda_l1': 0.1,
              'verbosity': -1}
# 5-fold CV with a fixed seed so all three sub-models share identical splits.
folds = KFold(n_splits=5, shuffle=True, random_state=4096)
X_ntrain = ntrain[fea_cols].values
X_train = train[fea_cols].values
X_test = test[fea_cols].values
print('='*10,'回归模型','='*10)
oof_lgb , predictions_lgb , scores_lgb = train_model(X_train , X_test, y_train, params=lgb_params, folds=folds, model_type='lgb', eval_type='regression')
# Regression trained only on the rows without outliers
print('='*10,'without outliers 回归模型','='*10)
oof_nlgb, predictions_nlgb, scores_nlgb = train_model(X_ntrain, X_test, y_ntrain, params=lgb_params, folds=folds, model_type='lgb', eval_type='regression')
# Next, predict the outlier samples themselves
print('='*10,'分类模型','='*10)
lgb_params['objective'] = 'binary'
lgb_params['metric'] = 'binary_logloss'
oof_blgb, predictions_blgb, scores_blgb = train_model(X_train , X_test, y_train_binary, params=lgb_params, folds=folds, model_type='lgb', eval_type='binary')
# Sample classification model (is this customer an outlier?)
然后将所有预测结果进行保存,包括一个分类模型、以及两个回归模型:
# Persist the single-model submission plus every model's OOF and test
# predictions, so the second-stage (stacking) step can reload them later.
sub_df = pd.read_csv('data/sample_submission.csv')
sub_df['target'] = predictions_lgb
sub_df.to_csv('predictions_lgb.csv', index=False)
oof_lgb = pd.DataFrame(oof_lgb)
oof_nlgb = pd.DataFrame(oof_nlgb)
oof_blgb = pd.DataFrame(oof_blgb)
predictions_lgb = pd.DataFrame(predictions_lgb)
predictions_nlgb = pd.DataFrame(predictions_nlgb)
predictions_blgb = pd.DataFrame(predictions_blgb)
oof_lgb.to_csv('./result/oof_lgb.csv', header=None, index=False)
oof_blgb.to_csv('./result/oof_blgb.csv', header=None, index=False)
oof_nlgb.to_csv('./result/oof_nlgb.csv', header=None, index=False)
predictions_lgb.to_csv('./result/predictions_lgb.csv', header=None, index=False)
predictions_nlgb.to_csv('./result/predictions_nlgb.csv', header=None, index=False)
predictions_blgb.to_csv('./result/predictions_blgb.csv', header=None, index=False)
#发现单模型情况下私榜3.61,公榜3.68,特征优化的效果十分明显
下面开始二阶段建模
sub_df = pd.read_csv('data/sample_submission.csv')
# Trick: samples predicted as outliers get the constant outlier target
# -33.219281 directly; everything else is weighted by (1 - outlier prob).
sub_df['target'] = predictions_bstack*-33.219281 + (1-predictions_bstack)*predictions_nstack
sub_df.to_csv('predictions_trick.csv', index=False)
如果担心异常值识别不一定准确,可以把上面的结果*0.5,再把直接求得的结果*0.5进行融合
sub_df = pd.read_csv('data/sample_submission.csv')
# Hedge against classifier mistakes: average the trick-based result with
# the plain stacked regression, 50/50.
sub_df['target'] = (predictions_bstack*-33.219281 + (1-predictions_bstack)*predictions_nstack)*0.5 + predictions_stack*0.5
sub_df.to_csv('predictions_trick&stacking.csv', index=False)
未来展望方向:
1、每个阶段都用stacking做模型融合
2、每个模型都用交叉验证