Datawhale Check-in Challenge: Kaggle Spaceship Titanic

I am trying out a Coggle data-science check-in challenge (Coggle 30 Days of ML, October 2022) and recording the learning process here!

Day 7: Multi-fold Training and Ensembling

Step 1: Split the data with KFold.

KFold was introduced in an earlier post; if you are not familiar with it, look up the online references (recommended), or read my earlier post (itself adapted from material found online). Let's go straight to the hands-on part.

import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

def cv_model(clf, train_x, train_y, test_x):
    """5-fold CV training. `clf` is the lightgbm module itself; returns the
    out-of-fold predictions, the averaged test predictions, and the
    per-fold feature importances."""
    folds = 5
    seed = 42
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    train = np.zeros(train_x.shape[0])   # out-of-fold predictions
    test = np.zeros(test_x.shape[0])     # averaged test predictions

    cv_scores = []
    test_pre = []
    feat_imps = pd.DataFrame()

    for i, (train_index, valid_index) in enumerate(kf.split(train_x)):
        print('********************* {} *********************'.format(i + 1))
        trn_x, trn_y = train_x.iloc[train_index], train_y[train_index]
        val_x, val_y = train_x.iloc[valid_index], train_y[valid_index]

        train_matrix = clf.Dataset(trn_x, label=trn_y)
        valid_matrix = clf.Dataset(val_x, label=val_y)

        params = {
            'objective': 'binary',
            'boosting_type': 'gbdt',
            'metric': 'auc',
            'n_jobs': 30,
            'learning_rate': 0.05,
            'num_leaves': 2 ** 6,
            'max_depth': 8,
            'tree_learner': 'serial',
            'colsample_bytree': 0.8,
            'subsample_freq': 1,
            'subsample': 0.8,
            'max_bin': 255,
            'verbose': -1,
            'seed': 2021,
            'bagging_seed': 2021,
            'feature_fraction_seed': 2021,
        }

        # note: passing verbose_eval/early_stopping_rounds to train() is the
        # LightGBM < 4.0 API; newer versions use callbacks instead
        model = clf.train(params, train_matrix, num_boost_round=2000,
                          valid_sets=[train_matrix, valid_matrix],
                          categorical_feature=[], verbose_eval=200,
                          early_stopping_rounds=400)
        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        test_pred = model.predict(test_x, num_iteration=model.best_iteration)
        test_pre.append(test_pred)

        # record this fold's feature importances
        fea = pd.DataFrame()
        fea['feature'] = train_x.columns.tolist()
        fea['score'] = model.feature_importance()
        feat_imps = pd.concat([feat_imps, fea], axis=0)
        print(sorted(zip(train_x.columns.tolist(), model.feature_importance()),
                     key=lambda x: x[1], reverse=True)[:20])

        train[valid_index] = val_pred
        cv_scores.append(roc_auc_score(val_y, val_pred))
        print(cv_scores)

    test = sum(test_pre) / folds   # average the per-fold test predictions
    print("cv_scores:", cv_scores)
    print("score_mean:", np.mean(cv_scores))
    print("score_std:", np.std(cv_scores))

    return train, test, feat_imps
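
A minimal invocation sketch, assuming the prepared features and labels live in x_train, y_train and x_test (hypothetical names; adapt them to your own preprocessing). Note that the lightgbm module itself is passed in as clf:

import lightgbm as lgb

# returns OOF predictions, averaged test predictions, and feature importances
lgb_train, lgb_test, feat_imps = cv_model(lgb, x_train, y_train, x_test)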

Step 2: Split the data with StratifiedKFold.

This is the same cv_model function as in Step 1 with only two lines changed: the splitter becomes StratifiedKFold, and the labels are passed to split() so that every fold preserves the class ratio.

    # kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    ...
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
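
Why stratify? A small self-contained sanity check (illustrative only, with made-up toy labels) shows that every validation fold keeps roughly the overall positive rate:

import numpy as np
from sklearn.model_selection import StratifiedKFold

y = np.array([0] * 80 + [1] * 20)   # imbalanced toy labels, 20% positives
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (_, valid_index) in enumerate(skf.split(np.zeros(len(y)), y)):
    # each fold's validation slice has ~20% positives
    print(fold, y[valid_index].mean())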

Step 3: Train and predict with LightGBM on the StratifiedKFold split.

The training results are as follows:

[screenshot: training results]

Step 4: How many models did Step 3 produce? Predict on the test set that many times, submit the latest prediction file to the competition, and screenshot the score.

In Step 3 we used 5-fold cross-validation, so we obtained 5 models. Each one predicts the test set once, and the mean of the 5 predictions is taken as the final result.
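
A sketch of turning the averaged probabilities into a submission file, assuming lgb_test holds the averaged test predictions from cv_model and the file names follow the competition's sample (PassengerId / Transported); both names are assumptions:

import pandas as pd

sub = pd.read_csv('sample_submission.csv')
sub['Transported'] = (lgb_test > 0.5)   # threshold the averaged probabilities
sub.to_csv('submission.csv', index=False)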

The leaderboard score:

[screenshot: leaderboard score]

Step 5: Train 5 machine learning models (SVM, LR, etc.) with cross-validation, ensemble them with stacking, submit the latest prediction file to the competition, and screenshot the score.

First, write a 5-fold cross-validation helper to make training convenient:

def model_train(model, model_name, kfold=5):
    """5-fold CV for an sklearn-style model. Relies on the global
    `train` (features), `label` (target Series) and `test` DataFrames."""
    oof_preds = np.zeros(train.shape[0])
    test_preds = np.zeros(test.shape[0])
    skf = StratifiedKFold(n_splits=kfold)
    print(f"Model = {model_name}")
    for k, (train_index, valid_index) in enumerate(skf.split(train, label)):
        x_trn, x_val = train.iloc[train_index, :], train.iloc[valid_index, :]
        y_trn, y_val = label.iloc[train_index], label.iloc[valid_index]

        model.fit(x_trn, y_trn)

        y_pred = model.predict_proba(x_val)[:, 1]
        oof_preds[valid_index] = y_pred.ravel()
        auc = roc_auc_score(y_val, y_pred)
        print("- KFold = %d, val_auc = %.4f" % (k, auc))
        # each fold's model also scores the real test set; averaged on return
        test_fold_preds = model.predict_proba(test)[:, 1]
        test_preds += test_fold_preds.ravel()
    print("Overall Model = %s, AUC = %.4f" % (model_name, roc_auc_score(label, oof_preds)))
    return test_preds / kfold

The stacking part:

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold, KFold

lr = LogisticRegression(random_state=2022, tol=1e-6)                   # logistic regression
tree = DecisionTreeClassifier(random_state=2022)                       # decision tree
svm = SVC(probability=True, random_state=2022, tol=1e-6)               # SVM
forest = RandomForestClassifier(n_estimators=100, random_state=2022)   # random forest
gbdt = GradientBoostingClassifier(random_state=2022)                   # GBDT

# name each estimator after what it actually is
estimators = [
    ('lr', lr),
    ('dt', tree),
    ('svm', svm),
    ('rf', forest),
    ('gbdt', gbdt),
]
clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression()
)
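
A hedged sketch of running the pieces end to end with the model_train helper above (it relies on the global train, label and test frames; the submission file names are assumptions):

# sanity-check each base model's CV AUC
for name, base_model in estimators:
    model_train(base_model, name)

# cross-validate the stacked ensemble and average its test predictions
stack_test_preds = model_train(clf, "Stacking")

sub = pd.read_csv('sample_submission.csv')
sub['Transported'] = (stack_test_preds > 0.5)
sub.to_csv('submission_stacking.csv', index=False)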

Leaderboard score:

[screenshot: leaderboard score]
