Datawhale打卡活动Day7

最新推荐文章于 2024-07-21 23:25:11 发布

麻辣香郭诶

最新推荐文章于 2024-07-21 23:25:11 发布

阅读量390

点赞数 1

分类专栏： Kaggle Spaceship Titanic打卡活动　文章标签： python、机器学习、人工智能

本文链接：https://blog.csdn.net/qq_52171945/article/details/127163647

版权

Kaggle Spaceship Titanic打卡活动专栏收录该内容

7 篇文章 3 订阅

订阅专栏

Datawhale打卡活动 Kaggle Spaceship Titanic

尝试了一个coggle科学的打卡活动（Coggle 30 Days of ML（22年10月）），记录一下学习过程！

Day7 多折训练与集成

步骤1：使用KFold完成数据划分；

KFold在原来的博客中也有相关的介绍，不清楚的读者可以查阅网上的相关资料（推荐），或者翻一下笔者写的博客（也是从网上copy下来的）。下面直接进入实战部分。

def cv_model(clf, train_x, train_y, test_x):
    """Train a binary classifier with 5-fold KFold cross-validation.

    For every fold a model is trained on the fold's training rows, used to
    predict the fold's validation rows (filling the out-of-fold vector) and
    used to predict the complete test set.  The per-fold test predictions are
    averaged at the end.

    Args:
        clf: Object exposing ``Dataset`` and ``train``; presumably the
            ``lightgbm`` module itself (confirm against the caller).
        train_x: Training features as a pandas DataFrame (rows are selected
            with ``.iloc`` and ``.columns`` is read).
        train_y: Training labels, one per row of ``train_x`` in the same row
            order.  Anything ``np.asarray`` accepts (array, list, Series).
        test_x: Test features passed unchanged to ``model.predict``.

    Returns:
        A tuple ``(train, test, Feass)``: the out-of-fold predictions for
        every training row, the fold-averaged test predictions, and a
        DataFrame with columns ``feas`` / ``sorce`` holding the feature
        importances of every fold stacked on top of each other.
    """
    folds = 5
    seed = 42
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)
    train = np.zeros(train_x.shape[0])

    # Index the labels by position so they always line up with
    # train_x.iloc[...]; plain `series[int_array]` is label-based and breaks
    # (or silently misaligns) when the Series index is not 0..n-1.
    labels = np.asarray(train_y)
    feature_names = train_x.columns.tolist()

    cv_scores = []
    test_pre = []
    Feass = pd.DataFrame()

    # Identical for every fold, so it is built once; each clf.train call
    # receives its own copy.
    # NOTE(review): 'num_boost_round' and 'early_stopping_rounds' are given
    # both here and as keyword arguments to clf.train with different values;
    # which one takes effect depends on the library version - confirm.
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'metric': 'auc',
        'n_jobs': 30,
        'learning_rate': 0.05,
        'num_leaves': 2 ** 6,
        'max_depth': 8,
        'tree_learner': 'serial',
        'colsample_bytree': 0.8,
        'subsample_freq': 1,
        'subsample': 0.8,
        'num_boost_round': 5000,
        'max_bin': 255,
        'verbose': -1,
        'seed': 2021,
        'bagging_seed': 2021,
        'feature_fraction_seed': 2021,
        'early_stopping_rounds': 200,
    }

    for i, (train_index, valid_index) in enumerate(kf.split(train_x)):
        print('********************* {} *********************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        train_matrix = clf.Dataset(trn_x, label=trn_y)
        valid_matrix = clf.Dataset(val_x, label=val_y)

        model = clf.train(
            dict(params),
            train_matrix,
            num_boost_round=2000,
            valid_sets=[train_matrix, valid_matrix],
            categorical_feature=[],
            verbose_eval=200,
            early_stopping_rounds=400,
        )
        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        test_pre.append(model.predict(test_x, num_iteration=model.best_iteration))

        # Collect this fold's feature importances and show the top 20.
        importance = model.feature_importance()
        fea = pd.DataFrame({'feas': feature_names, 'sorce': importance})
        Feass = pd.concat([Feass, fea], axis=0)
        print(list(sorted(zip(feature_names, importance), key=lambda x: x[1], reverse=True))[:20])

        # Every training row lands in exactly one validation fold, so after
        # the loop `train` holds one out-of-fold prediction per row.
        train[valid_index] = val_pred
        cv_scores.append(roc_auc_score(val_y, val_pred))

        print(cv_scores)

    # Average the test-set predictions of all fold models.
    test = sum(test_pre) / folds
    print("scotrainre_list:" , cv_scores)
    print("score_mean:" ,np.mean(cv_scores))
    print("score_std:" ,np.std(cv_scores))

    return train, test, Feass

步骤2：使用StratifiedKFold完成数据划分；

def cv_model(clf, train_x, train_y, test_x):
    """Train a binary classifier with 5-fold StratifiedKFold cross-validation.

    The folds are stratified on the labels.  For every fold a model is trained
    on the fold's training rows, used to predict the fold's validation rows
    (filling the out-of-fold vector) and used to predict the complete test
    set.  The per-fold test predictions are averaged at the end.

    Args:
        clf: Object exposing ``Dataset`` and ``train``; presumably the
            ``lightgbm`` module itself (confirm against the caller).
        train_x: Training features as a pandas DataFrame (rows are selected
            with ``.iloc`` and ``.columns`` is read).
        train_y: Training labels, one per row of ``train_x`` in the same row
            order.  Anything ``np.asarray`` accepts (array, list, Series).
        test_x: Test features passed unchanged to ``model.predict``.

    Returns:
        A tuple ``(train, test, Feass)``: the out-of-fold predictions for
        every training row, the fold-averaged test predictions, and a
        DataFrame with columns ``feas`` / ``sorce`` holding the feature
        importances of every fold stacked on top of each other.
    """
    folds = 5
    seed = 42
    kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
    train = np.zeros(train_x.shape[0])

    # Index the labels by position so they always line up with
    # train_x.iloc[...]; plain `series[int_array]` is label-based and breaks
    # (or silently misaligns) when the Series index is not 0..n-1.
    labels = np.asarray(train_y)
    feature_names = train_x.columns.tolist()

    cv_scores = []
    test_pre = []
    Feass = pd.DataFrame()

    # Identical for every fold, so it is built once; each clf.train call
    # receives its own copy.
    # NOTE(review): 'num_boost_round' and 'early_stopping_rounds' are given
    # both here and as keyword arguments to clf.train with different values;
    # which one takes effect depends on the library version - confirm.
    params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'metric': 'auc',
        'n_jobs': 30,
        'learning_rate': 0.05,
        'num_leaves': 2 ** 6,
        'max_depth': 8,
        'tree_learner': 'serial',
        'colsample_bytree': 0.8,
        'subsample_freq': 1,
        'subsample': 0.8,
        'num_boost_round': 5000,
        'max_bin': 255,
        'verbose': -1,
        'seed': 2021,
        'bagging_seed': 2021,
        'feature_fraction_seed': 2021,
        'early_stopping_rounds': 200,
    }

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, labels)):
        print('********************* {} *********************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        train_matrix = clf.Dataset(trn_x, label=trn_y)
        valid_matrix = clf.Dataset(val_x, label=val_y)

        model = clf.train(
            dict(params),
            train_matrix,
            num_boost_round=2000,
            valid_sets=[train_matrix, valid_matrix],
            categorical_feature=[],
            verbose_eval=200,
            early_stopping_rounds=400,
        )
        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        test_pre.append(model.predict(test_x, num_iteration=model.best_iteration))

        # Collect this fold's feature importances and show the top 20.
        importance = model.feature_importance()
        fea = pd.DataFrame({'feas': feature_names, 'sorce': importance})
        Feass = pd.concat([Feass, fea], axis=0)
        print(list(sorted(zip(feature_names, importance), key=lambda x: x[1], reverse=True))[:20])

        # Every training row lands in exactly one validation fold, so after
        # the loop `train` holds one out-of-fold prediction per row.
        train[valid_index] = val_pred
        cv_scores.append(roc_auc_score(val_y, val_pred))

        print(cv_scores)

    # Average the test-set predictions of all fold models.
    test = sum(test_pre) / folds
    print("scotrainre_list:" , cv_scores)
    print("score_mean:" ,np.mean(cv_scores))
    print("score_std:" ,np.std(cv_scores))

    return train, test, Feass

步骤3：使用StratifiedKFold配合LightGBM完成模型的训练和预测

训练结果如下：

步骤4：思考在步骤3中训练得到了多少个模型；用这些模型对测试集进行多次预测，将最新预测的结果文件提交到比赛，截图分数；

在步骤3中，我们使用了5折交叉验证，也就是得到了5个模型，对测试集进行了5次预测，然后求均值作为最后的结果。

线上分数如下：

步骤5：使用交叉验证训练5个机器学习模型（svm、lr等），使用stacking完成集成，将最新预测的结果文件提交到比赛，截图分数；

写一个五折交叉验证的函数，方便训练：

def model_train(model, model_name, kfold=5):
    """Cross-validate ``model`` and return its fold-averaged test probabilities.

    Reads the module-level ``train`` (features), ``label`` (targets) and
    ``test`` (features to score).  For each stratified fold the model is
    refitted on the fold's training rows, its positive-class probability on
    the held-out rows is stored as the out-of-fold prediction, the fold AUC is
    printed, and the positive-class probability on ``test`` is accumulated.

    Args:
        model: Estimator providing ``fit`` and ``predict_proba``.
        model_name: Label used in the printed progress lines.
        kfold: Number of stratified folds.

    Returns:
        The sum of the per-fold test probabilities divided by ``kfold``.
    """
    n_train_rows = train.shape[0]
    n_test_rows = test.shape[0]
    oof_preds = np.zeros(n_train_rows)
    test_preds = np.zeros(n_test_rows)

    splitter = StratifiedKFold(n_splits=kfold)
    print(f"Model = {model_name}")

    for fold_no, (fit_idx, holdout_idx) in enumerate(splitter.split(train, label)):
        x_fit = train.iloc[fit_idx, :]
        x_holdout = train.iloc[holdout_idx, :]
        y_fit = label.iloc[fit_idx]
        y_holdout = label.iloc[holdout_idx]

        model.fit(x_fit, y_fit)

        # Column 1 of predict_proba is taken as the positive-class score.
        holdout_proba = model.predict_proba(x_holdout)[:, 1]
        oof_preds[holdout_idx] = holdout_proba.ravel()
        fold_auc = roc_auc_score(y_holdout, holdout_proba)
        print("- KFold = %d, val_auc = %.4f" % (fold_no, fold_auc))

        # Accumulate this fold's test scores; averaged on return.
        test_preds += model.predict_proba(test)[:, 1].ravel()

    overall_auc = roc_auc_score(label, oof_preds)
    print("Overall Model = %s, AUC = %.4f" % (model_name, overall_auc))
    return test_preds / kfold

stacking部分：

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import StratifiedKFold, KFold

# Base learners for the stacking ensemble; each one gets a fixed random_state.
lr = LogisticRegression(random_state=2022, tol=1e-6)  # logistic regression
tree = DecisionTreeClassifier(random_state=2022)  # decision tree
# probability=True so that the SVM exposes predict_proba like the others.
svm = SVC(probability=True, random_state=2022, tol=1e-6)  # support vector machine
forest = RandomForestClassifier(n_estimators=100, random_state=2022)  # random forest
Gbdt = GradientBoostingClassifier(random_state=2022)  # gradient-boosted trees

# Each label names the estimator it is paired with, so lookups by name on the
# fitted stack return the model the label suggests.
estimators = [
    ('lr', lr),
    ('tree', tree),
    ('svm', svm),
    ('forest', forest),
    ('gbdt', Gbdt),
]

# Stacked ensemble: a logistic regression is the final estimator on top of
# the base learners listed above.
clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
)