# Step 1: feature selection
# Use every column except the row identifier, the issue date, and the label
# itself as model features.
features = [f for f in data.columns if f not in ['id', 'issueDate', 'isDefault']]

# Rows with a known label form the training set; rows whose label is missing
# are the unlabelled test set (a common layout when train/test were
# concatenated for joint preprocessing).
train = data[data.isDefault.notnull()].reset_index(drop=True)
test = data[data.isDefault.isnull()].reset_index(drop=True)
# Step 2: split train/test into feature matrices and label vectors
# Feature matrices and label vectors for each split.
# NOTE(review): rows in `test` have a null `isDefault`, so `y_test` is all-NaN
# by construction — it only serves as a placeholder here.
x_train, y_train = train[features], train['isDefault']
x_test, y_test = test[features], test['isDefault']
# Step 3: train a LightGBM model
# Convert the splits into LightGBM's internal Dataset format; the evaluation
# set shares bin boundaries with the training set via `reference`.
lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

boost_round = 30        # number of boosting iterations
early_stop_rounds = 30  # stop if validation shows no improvement for this many rounds

params = dict(
    boosting_type='gbdt',    # boosting strategy
    objective='binary',      # binary-classification objective
    metric=('l2', 'auc'),    # evaluation metrics
    num_leaves=10,           # max leaves per tree
    learning_rate=0.05,      # shrinkage rate
    feature_fraction=0.9,    # fraction of features sampled per tree
    bagging_fraction=0.8,    # fraction of rows sampled per bagging round
    bagging_freq=5,          # perform bagging every 5 iterations
    verbose=1,               # <0 fatal only, =0 errors/warnings, >0 info
)
# Train the model with early stopping: training halts once the validation
# metric has not improved for `early_stop_rounds` consecutive rounds.
# NOTE(review): the `early_stopping_rounds` / `evals_result` keyword arguments
# were removed in LightGBM 4.x in favour of callbacks
# (lgb.early_stopping / lgb.record_evaluation) — confirm the installed version.
results={}  # filled in-place with per-iteration metric values for each valid set
gbm=lgb.train(params,
lgb_train,
num_boost_round=boost_round,  # upper bound on boosting iterations
valid_sets=(lgb_eval,lgb_train),
valid_names=('validate','train'),  # names matching valid_sets order
early_stopping_rounds=early_stop_rounds,
evals_result=results
)
# Model prediction.
# Bug fix: the original referenced undefined `X_test`; the feature matrix is
# named `x_test` (lowercase) where the splits are built, so the original line
# raised NameError at runtime.
# `best_iteration` limits prediction to the early-stopped round count.
y_pred = gbm.predict(x_test, num_iteration=gbm.best_iteration)
print(y_test)  # NOTE(review): all-NaN by construction (test rows have no label)
print(y_pred)
# Step 4: evaluate the model under these settings and tune toward the best parameters.