stacking过程详解:
老版:
# ----- Old API (sklearn.cross_validation; deprecated in 0.18, removed in 0.20) -----
# Stacking demo: five base classifiers produce out-of-fold probability
# predictions on the training set; those predictions become the features
# of a second-stage ("stacker") logistic regression.
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import accuracy_score, roc_auc_score

n_folds = 5
data = load_breast_cancer()
X = data.data
y = data.target
# Split off a hold-out test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
# Old API: StratifiedKFold takes the label array directly and yields
# (train_idx, valid_idx) pairs; materialize so len(skf) is available.
skf = list(StratifiedKFold(y_train, n_folds))
clfs = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
        GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=100)]
print("Creating train and test sets for stacking.")
# Column j holds base model j's out-of-fold predictions (train matrix)
# and its fold-averaged predictions (test matrix).
dataset_blend_train = np.zeros((X_train.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_test.shape[0], len(clfs)))
for j, clf in enumerate(clfs):
    print(j, clf)
    dataset_blend_test_j = np.zeros((X_test.shape[0], len(skf)))
    for i, (fold1, fold2) in enumerate(skf):
        print("Fold", i)
        X_train_b, y_train_b = X_train[fold1], y_train[fold1]
        X_test_b, y_test_b = X_train[fold2], y_train[fold2]
        clf.fit(X_train_b, y_train_b)
        # Out-of-fold probability of the positive class feeds the stacker,
        # so the second-stage features are never predicted on seen data.
        y_submission = clf.predict_proba(X_test_b)[:, 1]
        dataset_blend_train[fold2, j] = y_submission
        dataset_blend_test_j[:, i] = clf.predict_proba(X_test)[:, 1]
    # Average the k per-fold test-set predictions into a single column.
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(axis=1)
print("Stacking.")
clf = LR()
clf.fit(dataset_blend_train, y_train)
print("融合后 Stacking Accuracy %0.6f:"%accuracy_score(y_test,clf.predict(dataset_blend_test)))
y_test_value = clf.predict_proba(dataset_blend_test)[:, 1]
# Min-max scale the scores (AUC is rank-based, so this does not change it).
y_test_value = (y_test_value - y_test_value.min()) / (y_test_value.max() - y_test_value.min())
# BUG FIX: the original called metrics.roc_auc_score, but the
# `from sklearn import metrics` import was commented out (NameError);
# use the directly imported roc_auc_score instead.
print('融合后,测试集auc:{:.4}'.format(roc_auc_score(y_test,y_test_value)))
# Baseline for comparison: a single logistic regression on the raw features.
lr = LR(random_state=1)
lr.fit(X_train, y_train)
y_test_label = lr.predict(X_test)
y_test_value = lr.predict_proba(X_test)[:, 1]
print("融合前,Accuracy:{:.4}".format(accuracy_score(y_test,y_test_label)))
print("融合前,测试集auc:{:.4}".format(roc_auc_score(y_test,y_test_value)))
新版:
# ----- New API (sklearn.model_selection, scikit-learn >= 0.18) -----
# Same stacking demo as above, ported to the current API: StratifiedKFold is
# constructed with the fold count and split via skf.split(X, y).
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression as LR
from sklearn.metrics import accuracy_score, roc_auc_score

n_folds = 5
data = load_breast_cancer()
X = data.data
y = data.target
# Split off a hold-out test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)
# New API: the splitter is configured with n_folds only; indices come from .split().
skf = StratifiedKFold(n_folds)
clfs = [RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        RandomForestClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='gini'),
        ExtraTreesClassifier(n_estimators=100, n_jobs=-1, criterion='entropy'),
        GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=100)]
print("Creating train and test sets for stacking.")
# Column j holds base model j's out-of-fold predictions (train matrix)
# and its fold-averaged predictions (test matrix).
dataset_blend_train = np.zeros((X_train.shape[0], len(clfs)))
dataset_blend_test = np.zeros((X_test.shape[0], len(clfs)))
for j, clf in enumerate(clfs):
    print(j, clf)
    dataset_blend_test_j = np.zeros((X_test.shape[0], n_folds))
    for i, (fold1, fold2) in enumerate(skf.split(X_train, y_train)):
        print("Fold", i)
        X_train_b, y_train_b = X_train[fold1], y_train[fold1]
        X_test_b, y_test_b = X_train[fold2], y_train[fold2]
        clf.fit(X_train_b, y_train_b)
        # Out-of-fold probability of the positive class feeds the stacker,
        # so the second-stage features are never predicted on seen data.
        y_submission = clf.predict_proba(X_test_b)[:, 1]
        dataset_blend_train[fold2, j] = y_submission
        dataset_blend_test_j[:, i] = clf.predict_proba(X_test)[:, 1]
    # Average the k per-fold test-set predictions into a single column.
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(axis=1)
print("Stacking.")
clf = LR()
clf.fit(dataset_blend_train, y_train)
print("融合后 Stacking Accuracy %0.6f:"%accuracy_score(y_test,clf.predict(dataset_blend_test)))
y_test_value = clf.predict_proba(dataset_blend_test)[:, 1]
# Min-max scale the scores (AUC is rank-based, so this does not change it).
y_test_value = (y_test_value - y_test_value.min()) / (y_test_value.max() - y_test_value.min())
print('融合后,测试集auc:{:.4}'.format(roc_auc_score(y_test,y_test_value)))
# Baseline for comparison: a single logistic regression on the raw features.
lr = LR(random_state=1)
lr.fit(X_train, y_train)
y_test_label = lr.predict(X_test)
y_test_value = lr.predict_proba(X_test)[:, 1]
print("融合前,Accuracy:{:.4}".format(accuracy_score(y_test,y_test_label)))
print("融合前,测试集auc:{:.4}".format(roc_auc_score(y_test,y_test_value)))
Blending与Stacking的区别
Blending与Stacking大致相同,只是Blending的主要区别在于训练集不是通过K-Fold的CV策略来获得预测值从而生成第二阶段模型的特征,而是建立一个Holdout集,例如说10%的训练数据,第二阶段的stacker模型就基于第一阶段模型对这10%训练数据的预测值进行拟合。说白了,就是把Stacking流程中的K-Fold CV 改成 HoldOut CV。
Blending的优点在于:
- 比stacking简单(因为不用进行k次的交叉验证来获得stacker feature)
- 避开了一个信息泄露问题:generalizers和stacker使用了不一样的数据集
- 在团队建模过程中,不需要给队友分享自己的随机种子
而缺点在于:
- 使用了很少的数据(第二阶段的blender只使用training set 10%的量)
- blender可能会过拟合(其实大概率是第一点导致的)
- stacking使用多次的CV会比较稳健
对于实践中的结果而言,stacking和blending的效果是差不多的,所以使用哪种方法都没什么所谓,完全取决于个人爱好。
参考:
https://www.cnblogs.com/HongjianChen/p/9706806.html 模型融合之blending和stacking
https://blog.csdn.net/u012735708/article/details/82349731?depth_1-utm_source=distribute.pc_relevant.none-task&utm_source=distribute.pc_relevant.none-task 集成学习-模型融合学习笔记(附Python代码)
https://blog.csdn.net/shine19930820/article/details/75209021#2-%E8%9E%8D%E5%90%88%E7%9A%84%E6%9D%A1%E4%BB%B6 Ensemble Learning-模型融合-Python实现