# stacking是模型融合的一种方法 (stacking is a model-fusion/ensemble method),
# 参考 https://www.sohu.com/a/302683886_787107
def evaluation(model, test_x, test_y, model_type='other_model'):
    """Evaluate a fitted binary classifier on a held-out set.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict``; must also provide ``predict_proba``
        unless ``model_type == 'lgb'``, in which case ``predict`` is
        assumed to return positive-class probabilities directly
        (LightGBM Booster behavior).
    test_x, test_y : array-like
        Test features and binary labels (positive label = 1).
    model_type : str
        ``'lgb'`` for the LightGBM wrapper, anything else for
        sklearn-style estimators.

    Returns
    -------
    tuple
        ``(auc, f1, recall)`` computed on the test set.
    """
    predict = model.predict(test_x)
    if model_type == 'lgb':
        # LightGBM predict() yields probabilities; threshold at 0.5
        # to obtain hard labels for f1/recall.
        y_score = np.array(predict)
        predict = [1 if x >= 0.5 else 0 for x in predict]
        fpr, tpr, thresholds = roc_curve(test_y, y_score, pos_label=1)
    else:
        y_score = model.predict_proba(test_x)
        # Column 1 is the positive-class probability.
        fpr, tpr, thresholds = roc_curve(test_y, y_score[:, 1], pos_label=1)
    model_auc = auc(fpr, tpr)
    model_f1_score = f1_score(test_y, predict)
    # NOTE(review): average='micro' makes recall equal to accuracy for
    # binary labels — confirm this is intentional.
    recall = recall_score(test_y, predict, average='micro')
    return model_auc, model_f1_score, recall
class Lgb_myself(object):
    """Thin wrapper giving a LightGBM Booster an sklearn-like
    fit / predict_proba interface so it can sit alongside sklearn
    estimators in the stacking loop.
    """

    def fit(self, train_x, train_y):
        """Train a binary-objective LightGBM booster on (train_x, train_y).

        NOTE(review): the eval set is the training data itself, so
        early stopping monitors training loss rather than a held-out
        set — confirm this is intended.
        """
        lgb_train = lgb.Dataset(train_x, train_y)
        lgb_eval = lgb.Dataset(train_x, train_y, reference=lgb_train)
        params = {
            'task': 'train',
            'boosting_type': 'gbdt',
            'application': 'binary',
            # Fix: 'f1' is not a built-in LightGBM metric and triggers an
            # unknown-metric error; use binary_logloss alongside l2.
            'metric': {'l2', 'binary_logloss'},
            'num_leaves': 15,
            'learning_rate': 0.05,
            'feature_fraction': 0.7,
            'bagging_fraction': 0.7,
            'bagging_freq': 5,
            # The original set both 'verbose': 1 and 'verbosity': -1,
            # which contradict each other; keep the quiet setting.
            'verbosity': -1,
        }
        self.final_model = lgb.train(
            params,
            lgb_train,
            num_boost_round=30,
            valid_sets=lgb_eval,
            early_stopping_rounds=2,
        )

    def predict_proba(self, test_x):
        """Return positive-class probabilities for test_x (1-D array)."""
        return self.final_model.predict(test_x)
def stacking_model(non_train_x, non_test_x, train_x, test_x, train_y, test_y, selected_var):
    """Two-level stacking ensemble.

    Level 1: LogisticRegressionCV, RandomForest, AdaBoost and a LightGBM
    wrapper produce out-of-fold probabilities on the training set and
    fold-averaged probabilities on the test set.
    Level 2: a LogisticRegressionCV meta-learner is fit on the stacked
    out-of-fold features.

    Parameters
    ----------
    non_train_x, non_test_x : pd.DataFrame
        Unstandardized train/test features (kept for reporting only).
    train_x, test_x : pd.DataFrame
        Standardized train/test features; restricted to ``selected_var``.
    train_y, test_y : array-like
        Binary labels.
    selected_var : list
        Feature names used by the base learners.

    Returns
    -------
    tuple
        ``(auc, f1, recall, meta_model)`` evaluated on the test set.
    """
    train_x = train_x[selected_var]
    test_x = test_x[selected_var]
    lr = LogisticRegressionCV(penalty='l2', class_weight='balanced', cv=5, Cs=20,
                              solver='liblinear')
    rf = RandomForestClassifier(max_features='sqrt', min_samples_split=15, max_depth=10,
                                class_weight='balanced', n_estimators=100,
                                min_samples_leaf=25)
    bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=8, min_samples_split=15,
                                                    min_samples_leaf=25),
                             n_estimators=100, learning_rate=0.05)
    # Renamed from `lgb` so the lightgbm module name is not shadowed locally.
    lgb_wrapper = Lgb_myself()
    stack_model = [lr, rf, bdt, lgb_wrapper]
    n = len(stack_model)
    ntrain = train_x.shape[0]
    ntest = test_x.shape[0]
    train_stack = np.zeros((ntrain, n))
    test_stack = np.zeros((ntest, n))
    non_train_stack = np.zeros((ntrain, n))
    non_test_stack = np.zeros((ntest, n))
    kf = KFold(5)
    train_vars = list(non_train_x.columns)
    train_x = np.array(train_x)
    test_x = np.array(test_x)
    non_train_x = np.array(non_train_x)
    non_test_x = np.array(non_test_x)
    train_y = np.array(train_y)
    for i, model in enumerate(stack_model):
        # Per-fold test predictions; averaged across folds afterwards.
        tmp_stack_test = np.zeros((ntest, 5))
        for j, (train_fold, validata) in enumerate(kf.split(train_x, train_y)):
            kf_train, kf_validata = train_x[train_fold, :], train_x[validata, :]
            label_train = train_y[train_fold]
            model.fit(kf_train, label_train)
            if i < 3:
                # sklearn estimators: take the positive-class column.
                oof_pred = model.predict_proba(kf_validata)[:, 1]
                fold_test_pred = model.predict_proba(test_x)[:, 1]
            else:
                # LightGBM wrapper already returns positive-class probabilities.
                oof_pred = model.predict_proba(kf_validata)
                fold_test_pred = model.predict_proba(test_x)
            # The original filled two identical arrays from two identical
            # predict_proba calls; predict once and reuse the result.
            train_stack[validata, i] = oof_pred
            non_train_stack[validata, i] = oof_pred
            tmp_stack_test[:, j] = fold_test_pred
        test_stack[:, i] = np.mean(tmp_stack_test, axis=1)
        non_test_stack[:, i] = test_stack[:, i]
    lr_model = LogisticRegressionCV(penalty='l2', class_weight='balanced', cv=3, Cs=20,
                                    solver='liblinear')
    lr_model.fit(train_stack, train_y)
    vars_name = train_vars + ['lr_pre', 'rf_pre', 'adaboost_pre', 'lgb_pre'] + ['label']
    # Reporting frames: raw features + base-learner scores + label + meta score.
    train = np.concatenate((non_train_x, non_train_stack, train_y.reshape(-1, 1),
                            lr_model.predict_proba(train_stack)[:, 1].reshape(-1, 1)),
                           axis=1)
    train = pd.DataFrame(train, columns=vars_name + ['stacking_pre'])
    train_auc, train_f1_score, train_recall = evaluation(lr_model, train_stack, train_y, 'lr')
    print(train_auc, train_f1_score, train_recall)
    model_auc, model_f1_score, recall = evaluation(lr_model, test_stack, test_y, 'lr')
    test_y = np.array(test_y)
    test = np.concatenate((non_test_x, non_test_stack, test_y.reshape(-1, 1),
                           lr_model.predict_proba(test_stack)[:, 1].reshape(-1, 1)),
                          axis=1)
    test = pd.DataFrame(test, columns=vars_name + ['stacking_pre'])
    print(test_stack[:10])
    print(test_y[:10])
    print(lr_model.predict_proba(test_stack)[:, 1][:10])
    return model_auc, model_f1_score, recall, lr_model
# non_train_x是没有进行标准化的训练集,train_x是对连续变量进行了标准化后的训练集。
# (non_train_x is the unstandardized training set; train_x has its continuous
# variables standardized.)