机器学习之stacking

 

stacking过程详解:

 

 

老版:

# Old-API version (sklearn < 0.20): uses the deprecated sklearn.cross_validation
# module. Demonstrates manual stacking: out-of-fold predictions from several base
# classifiers become the features for a logistic-regression meta-model.

import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import accuracy_score, roc_auc_score


n_folds = 5

data = load_breast_cancer()
X = data.data
y = data.target
# Split the dataset into train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

# Old-style StratifiedKFold: constructed from the labels themselves, then
# materialized into a list of (train_indices, valid_indices) pairs.
skf = list(StratifiedKFold(y_train, n_folds))

clfs = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
        GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=100)]

print( "Creating train and test sets for stacking.")

# Column j of dataset_blend_train holds model j's out-of-fold predictions for
# the training rows; column j of dataset_blend_test holds model j's test-set
# predictions averaged over the k folds.
dataset_blend_train = np.zeros((X_train.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_test.shape[0], len(clfs)))

for j, clf in enumerate(clfs):
    print (j, clf)
    dataset_blend_test_j = np.zeros((X_test.shape[0], len(skf)))
    for i, (fold1, fold2) in enumerate(skf):
        print("Fold", i)
        X_train_b, y_train_b, X_test_b, y_test_b = X_train[fold1], y_train[fold1], X_train[fold2], y_train[fold2]
        clf.fit(X_train_b, y_train_b)
        # Positive-class probability on the held-out fold -> out-of-fold feature.
        y_submission = clf.predict_proba(X_test_b)[:, 1]
        dataset_blend_train[fold2, j] = y_submission
        dataset_blend_test_j[:, i] = clf.predict_proba(X_test)[:, 1]
    # Average the per-fold test predictions into one column per base model.
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(axis=1)

print( "Stacking.")
clf = LR()
clf.fit(dataset_blend_train, y_train)
print("融合后 Stacking Accuracy %0.6f:"%accuracy_score(y_test,clf.predict(dataset_blend_test)))
y_test_value = clf.predict_proba(dataset_blend_test)[:, 1]
# Min-max normalize the scores (monotonic transform, so AUC is unchanged).
y_test_value = (y_test_value - y_test_value.min()) / (y_test_value.max() - y_test_value.min())
# BUG FIX: the original called metrics.roc_auc_score while the `metrics` import
# was commented out (NameError at runtime); use the directly imported function.
print('融合后,测试集auc:{:.4}'.format(roc_auc_score(y_test, y_test_value)))

# Baseline for comparison: a single logistic-regression model on the raw features.
lr = LR(random_state=1)
lr.fit(X_train, y_train)
y_test_label = lr.predict(X_test)
y_test_value = lr.predict_proba(X_test)[:, 1]
print("融合前,Accuracy:{:.4}".format(accuracy_score(y_test,y_test_label)))
print("融合前,测试集auc:{:.4}".format(roc_auc_score(y_test,y_test_value)))

 新版:

# New-API version (sklearn >= 0.18): the same manual stacking demo, built on
# sklearn.model_selection instead of the removed cross_validation module.

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import accuracy_score, roc_auc_score


n_folds = 5
data = load_breast_cancer()
X, y = data.data, data.target
print(y)
# Hold out 20% of the rows as the final test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

skf = StratifiedKFold(n_folds)

clfs = [
    RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
    RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
    ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
    ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
    GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=100),
]

print("Creating train and test sets for stacking.")

# Column j holds base model j's scores: out-of-fold for the training rows,
# fold-averaged for the test rows.
dataset_blend_train = np.zeros((X_train.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_test.shape[0], len(clfs)))

for j, model in enumerate(clfs):
    print(j, model)
    per_fold_test = np.zeros((X_test.shape[0], n_folds))
    for i, (train_idx, valid_idx) in enumerate(skf.split(X_train, y_train)):
        print("Fold", i)
        model.fit(X_train[train_idx], y_train[train_idx])
        # Positive-class probability on the held-out fold -> stacking feature.
        dataset_blend_train[valid_idx, j] = model.predict_proba(X_train[valid_idx])[:, 1]
        per_fold_test[:, i] = model.predict_proba(X_test)[:, 1]
    # Collapse the k per-fold test predictions into one column per base model.
    dataset_blend_test[:, j] = per_fold_test.mean(axis=1)

print("Stacking.")
stacker = LR()
stacker.fit(dataset_blend_train, y_train)
print("融合后 Stacking Accuracy %0.6f:" % accuracy_score(y_test, stacker.predict(dataset_blend_test)))
stack_scores = stacker.predict_proba(dataset_blend_test)[:, 1]
# Min-max scale the scores (monotonic, so the AUC is unaffected).
stack_scores = (stack_scores - stack_scores.min()) / (stack_scores.max() - stack_scores.min())
print('融合后,测试集auc:{:.4}'.format(roc_auc_score(y_test, stack_scores)))

# Baseline for comparison: one logistic-regression model on the raw features.
baseline = LR(random_state=1)
baseline.fit(X_train, y_train)
base_labels = baseline.predict(X_test)
base_scores = baseline.predict_proba(X_test)[:, 1]
print("融合前,Accuracy:{:.4}".format(accuracy_score(y_test, base_labels)))
print("融合前,测试集auc:{:.4}".format(roc_auc_score(y_test, base_scores)))

 

Blending与Stacking的区别

Blending与Stacking大致相同,只是Blending的主要区别在于训练集不是通过K-Fold的CV策略来获得预测值从而生成第二阶段模型的特征,而是建立一个Holdout集,例如说10%的训练数据,第二阶段的stacker模型就基于第一阶段模型对这10%训练数据的预测值进行拟合。说白了,就是把Stacking流程中的K-Fold CV 改成 HoldOut CV。

Blending的优点在于:

  1. 比stacking简单(因为不用进行k次的交叉验证来获得stacker feature)
  2. 避开了一个信息泄露问题:generalizers和stacker使用了不一样的数据集
  3. 在团队建模过程中,不需要给队友分享自己的随机种子

而缺点在于:

  1. 使用了很少的数据(第二阶段的blender只使用training set 10%的量)
  2. blender可能会过拟合(其实大概率是第一点导致的)
  3. stacking使用多次的CV会比较稳健

对于实践中的结果而言,stacking和blending的效果是差不多的,所以使用哪种方法都没什么所谓,完全取决于个人爱好。

 

参考:

https://www.cnblogs.com/HongjianChen/p/9706806.html   模型融合之blending和stacking

https://blog.csdn.net/u012735708/article/details/82349731?depth_1-utm_source=distribute.pc_relevant.none-task&utm_source=distribute.pc_relevant.none-task  集成学习-模型融合学习笔记(附Python代码)

https://blog.csdn.net/shine19930820/article/details/75209021#2-%E8%9E%8D%E5%90%88%E7%9A%84%E6%9D%A1%E4%BB%B6  Ensemble Learning-模型融合-Python实现

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值