# stacking是模型融合的一种方法 (stacking is a model-fusion/ensemble method),
# 参考 https://www.sohu.com/a/302683886_787107
def evaluation(model, test_x, test_y, model_type='other_model'):
    """Evaluate a fitted binary classifier on a held-out set.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict``; must also provide ``predict_proba``
        unless ``model_type == 'lgb'``, in which case ``predict`` is
        assumed to return positive-class probabilities directly
        (LightGBM Booster behavior).
    test_x, test_y : array-like
        Test features and binary labels (positive label = 1).
    model_type : str
        ``'lgb'`` for the LightGBM wrapper, anything else for
        sklearn-style estimators.

    Returns
    -------
    tuple
        ``(auc, f1, recall)`` computed on the test set.
    """
    predict = model.predict(test_x)
    if model_type == 'lgb':
        # LightGBM predict() yields probabilities; threshold at 0.5
        # to obtain hard labels for f1/recall.
        y_score = np.array(predict)
        predict = [1 if x >= 0.5 else 0 for x in predict]
        fpr, tpr, thresholds = roc_curve(test_y, y_score, pos_label=1)
    else:
        y_score = model.predict_proba(test_x)
        # Column 1 is the positive-class probability.
        fpr, tpr, thresholds = roc_curve(test_y, y_score[:, 1], pos_label=1)
    model_auc = auc(fpr, tpr)
    model_f1_score = f1_score(test_y, predict)
    # NOTE(review): average='micro' makes recall equal to accuracy for
    # binary labels — confirm this is intentional.
    recall = recall_score(test_y, predict, average='micro')
    return model_auc, model_f1_score, recall
class Lgb_myself(object):
    """Thin wrapper giving a LightGBM Booster an sklearn-like
    fit / predict_proba interface so it can sit alongside sklearn
    estimators in the stacking loop.
    """

    def fit(self, train_x, train_y):
        """Train a binary-objective LightGBM booster on (train_x, train_y).

        NOTE(review): the eval set is the training data itself, so
        early stopping monitors training loss rather than a held-out
        set — confirm this is intended.
        """
        lgb_train = lgb.Dataset(train_x, train_y)
        lgb_eval = lgb.Dataset(train_x, train_y, reference=lgb_train)
        params = {
            'task': 'train',
            'boosting_type': 'gbdt',
            'application': 'binary',
            # Fix: 'f1' is not a built-in LightGBM metric and triggers an
            # unknown-metric error; use binary_logloss alongside l2.
            'metric': {'l2', 'binary_logloss'},
            'num_leaves': 15,
            'learning_rate': 0.05,
            'feature_fraction': 0.7,
            'bagging_fraction': 0.7,
            'bagging_freq': 5,
            # The original set both 'verbose': 1 and 'verbosity': -1,
            # which contradict each other; keep the quiet setting.
            'verbosity': -1,
        }
        self.final_model = lgb.train(
            params,
            lgb_train,
            num_boost_round=30,
            valid_sets=lgb_eval,
            early_stopping_rounds=2,
        )

    def predict_proba(self, test_x):
        """Return positive-class probabilities for test_x (1-D array)."""
        return self.final_model.predict(test_x)
def stacking_model(non_train_x, non_test_x, train_x, test_x, train_y, test_y, selected_var):
    """Two-level stacking ensemble.

    Level 1: LogisticRegressionCV, RandomForest, AdaBoost and a LightGBM
    wrapper produce out-of-fold probabilities on the training set and
    fold-averaged probabilities on the test set.
    Level 2: a LogisticRegressionCV meta-learner is fit on the stacked
    out-of-fold features.

    Parameters
    ----------
    non_train_x, non_test_x : pd.DataFrame
        Unstandardized train/test features (kept for reporting only).
    train_x, test_x : pd.DataFrame
        Standardized train/test features; restricted to ``selected_var``.
    train_y, test_y : array-like
        Binary labels.
    selected_var : list
        Feature names used by the base learners.

    Returns
    -------
    tuple
        ``(auc, f1, recall, meta_model)`` evaluated on the test set.
    """
    train_x = train_x[selected_var]
    test_x = test_x[selected_var]
    lr = LogisticRegressionCV(penalty='l2', class_weight='balanced', cv=5, Cs=20,
                              solver='liblinear')
    rf = RandomForestClassifier(max_features='sqrt', min_samples_split=15, max_depth=10,
                                class_weight='balanced', n_estimators=100,
                                min_samples_leaf=25)
    bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=8, min_samples_split=15,
                                                    min_samples_leaf=25),
                             n_estimators=100, learning_rate=0.05)
    # Renamed from `lgb` so the lightgbm module name is not shadowed locally.
    lgb_wrapper = Lgb_myself()
    stack_model = [lr, rf, bdt, lgb_wrapper]
    n = len(stack_model)
    ntrain = train_x.shape[0]
    ntest = test_x.shape[0]
    train_stack = np.zeros((ntrain, n))
    test_stack = np.zeros((ntest, n))
    non_train_stack = np.zeros((ntrain, n))
    non_test_stack = np.zeros((ntest, n))
    kf = KFold(5)
    train_vars = list(non_train_x.columns)
    train_x = np.array(train_x)
    test_x = np.array(test_x)
    non_train_x = np.array(non_train_x)
    non_test_x = np.array(non_test_x)
    train_y = np.array(train_y)
    for i, model in enumerate(stack_model):
        # Per-fold test predictions; averaged across folds afterwards.
        tmp_stack_test = np.zeros((ntest, 5))
        for j, (train_fold, validata) in enumerate(kf.split(train_x, train_y)):
            kf_train, kf_validata = train_x[train_fold, :], train_x[validata, :]
            label_train = train_y[train_fold]
            model.fit(kf_train, label_train)
            if i < 3:
                # sklearn estimators: take the positive-class column.
                oof_pred = model.predict_proba(kf_validata)[:, 1]
                fold_test_pred = model.predict_proba(test_x)[:, 1]
            else:
                # LightGBM wrapper already returns positive-class probabilities.
                oof_pred = model.predict_proba(kf_validata)
                fold_test_pred = model.predict_proba(test_x)
            # The original filled two identical arrays from two identical
            # predict_proba calls; predict once and reuse the result.
            train_stack[validata, i] = oof_pred
            non_train_stack[validata, i] = oof_pred
            tmp_stack_test[:, j] = fold_test_pred
        test_stack[:, i] = np.mean(tmp_stack_test, axis=1)
        non_test_stack[:, i] = test_stack[:, i]
    lr_model = LogisticRegressionCV(penalty='l2', class_weight='balanced', cv=3, Cs=20,
                                    solver='liblinear')
    lr_model.fit(train_stack, train_y)
    vars_name = train_vars + ['lr_pre', 'rf_pre', 'adaboost_pre', 'lgb_pre'] + ['label']
    # Reporting frames: raw features + base-learner scores + label + meta score.
    train = np.concatenate((non_train_x, non_train_stack, train_y.reshape(-1, 1),
                            lr_model.predict_proba(train_stack)[:, 1].reshape(-1, 1)),
                           axis=1)
    train = pd.DataFrame(train, columns=vars_name + ['stacking_pre'])
    train_auc, train_f1_score, train_recall = evaluation(lr_model, train_stack, train_y, 'lr')
    print(train_auc, train_f1_score, train_recall)
    model_auc, model_f1_score, recall = evaluation(lr_model, test_stack, test_y, 'lr')
    test_y = np.array(test_y)
    test = np.concatenate((non_test_x, non_test_stack, test_y.reshape(-1, 1),
                           lr_model.predict_proba(test_stack)[:, 1].reshape(-1, 1)),
                          axis=1)
    test = pd.DataFrame(test, columns=vars_name + ['stacking_pre'])
    print(test_stack[:10])
    print(test_y[:10])
    print(lr_model.predict_proba(test_stack)[:, 1][:10])
    return model_auc, model_f1_score, recall, lr_model
# non_train_x是没有进行标准化的训练集,train_x是对连续变量进行了标准化后的训练集。
# (non_train_x is the unstandardized training set; train_x has its continuous
# variables standardized.)