集成学习-Task13

Stacking集成学习算法

from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier 
from mlxtend.classifier import StackingCVClassifier
import matplotlib.pyplot as plt
# Load iris, keeping only sepal width and petal length (2 features so
# the decision regions can be drawn in 2-D later on).
iris = load_iris()
X, y = iris.data[:, 1:3], iris.target
RANDOM_SEED = 42

# First-level (base) estimators.
clf1 = KNeighborsClassifier(3)
clf2 = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
clf3 = GaussianNB()
# Second-level (meta) estimator.
lr = LogisticRegression()

# Two-level stacking ensemble: base classifiers feed the meta classifier.
sclf = StackingCVClassifier(
    classifiers=[clf1, clf2, clf3],  # first-level classifiers
    meta_classifier=lr,              # second-level classifier
    random_state=RANDOM_SEED,
)

print('3-fold cross validation:\n')
estimators = [clf1, clf2, clf3, sclf]
labels = ['KNN', 'RFC', 'Naive Bayes', 'StackingClassifier']
for clf, label in zip(estimators, labels):
    scores = cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    print("Accuracy: %0.4f (+/- %0.4f) [%s]" % (scores.mean(), scores.std(), label))
3-fold cross validation:

Accuracy: 0.9733 (+/- 0.0094) [KNN]
Accuracy: 0.9667 (+/- 0.0189) [RFC]
Accuracy: 0.9400 (+/- 0.0163) [Naive Bayes]
Accuracy: 0.9667 (+/- 0.0189) [StackingClassifier]
# Draw each estimator's decision boundary in a 2x2 grid of subplots.
from mlxtend.plotting import plot_decision_regions
import matplotlib.gridspec as gridspec
import itertools

grid_layout = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(10, 8))
names = ['KNN', 'RFC', 'Naive Bayes', 'StackingClassifier']
cells = itertools.product([0, 1], repeat=2)  # (row, col) for each subplot
for clf, lab, (row, col) in zip([clf1, clf2, clf3, sclf], names, cells):
    clf.fit(X, y)  # plot_decision_regions needs a fitted estimator
    ax = plt.subplot(grid_layout[row, col])
    fig = plot_decision_regions(X=X, y=y, clf=clf)
    plt.title(lab)
plt.show()

![StackingCVClassifier 各模型决策边界](output_7_0.svg)

  • 绘制决策边界只能是二维

修改第一层基分类器的输出

若 use_probas = True（注意：下方代码并未设置 average_probas——StackingCVClassifier 似乎不提供该参数，此处仅演示 use_probas=True，待确认）

# Reload the full 4-feature iris data for the probability-based variant.
iris = load_iris()
X, y = iris.data, iris.target

# Base estimators (same configuration as before).
clf1 = KNeighborsClassifier(3)
clf2 = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
clf3 = GaussianNB()
lr = LogisticRegression()

# Stacking where the meta classifier is trained on the base classifiers'
# predicted class probabilities instead of their hard labels.
sclf = StackingCVClassifier(
    classifiers=[clf1, clf2, clf3],  # first-level classifiers
    meta_classifier=lr,              # second-level classifier
    use_probas=True,
    random_state=RANDOM_SEED,
)

print('3-fold cross validation:\n')
names = ['KNN', 'RFC', 'Naive Bayes', 'StackingClassifier']
for clf, label in zip([clf1, clf2, clf3, sclf], names):
    scores = cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    print("Accuracy: %0.4f (+/- %0.4f) [%s]" % (scores.mean(), scores.std(), label))
3-fold cross validation:

Accuracy: 0.9733 (+/- 0.0094) [KNN]
Accuracy: 0.9667 (+/- 0.0189) [RFC]
Accuracy: 0.9400 (+/- 0.0163) [Naive Bayes]
Accuracy: 0.9667 (+/- 0.0189) [StackingClassifier]

若 use_probas = True（同上：此段代码与上一节完全相同，并未设置 average_probas）

# Rebuild the same ensemble; note this section is code-identical to the
# previous one — only the surrounding prose differs.
clf1 = KNeighborsClassifier(3)
clf2 = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
clf3 = GaussianNB()
lr = LogisticRegression()

# Two-level stacking with probability features for the meta classifier.
sclf = StackingCVClassifier(
    classifiers=[clf1, clf2, clf3],  # first-level classifiers
    meta_classifier=lr,              # second-level classifier
    use_probas=True,
    random_state=RANDOM_SEED,
)

print('3-fold cross validation:\n')
model_names = ['KNN', 'RFC', 'Naive Bayes', 'StackingClassifier']
for clf, label in zip([clf1, clf2, clf3, sclf], model_names):
    scores = cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    print("Accuracy: %0.4f (+/- %0.4f) [%s]" % (scores.mean(), scores.std(), label))
3-fold cross validation:

Accuracy: 0.9733 (+/- 0.0094) [KNN]
Accuracy: 0.9667 (+/- 0.0189) [RFC]
Accuracy: 0.9400 (+/- 0.0163) [Naive Bayes]
Accuracy: 0.9667 (+/- 0.0189) [StackingClassifier]

堆叠5折CV分类与网格搜索(结合网格搜索调参优化)

from sklearn.model_selection import GridSearchCV

# Base and meta estimators for the grid search.
clf1 = KNeighborsClassifier(1)
clf2 = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
clf3 = GaussianNB()
lr = LogisticRegression()

# Two-level stacking ensemble to be tuned.
sclf = StackingCVClassifier(
    classifiers=[clf1, clf2, clf3],  # first-level classifiers
    meta_classifier=lr,              # second-level classifier
    random_state=RANDOM_SEED,
)

# Base-estimator params are addressed as '<lowercased class name>__<param>';
# the meta estimator's as 'meta_classifier__<param>'.
params = {'kneighborsclassifier__n_neighbors': [1, 5],
          'randomforestclassifier__n_estimators': [10, 100],
          'meta_classifier__C': [0.1, 10.0]}

grid = GridSearchCV(estimator=sclf,
                    param_grid=params,
                    cv=5,
                    refit=True)
grid.fit(X, y)

cv_keys = ('mean_test_score', 'std_test_score', 'params')
for r, _ in enumerate(grid.cv_results_['mean_test_score']):
    # FIX: the std was previously halved (/ 2.0) while still being
    # labelled "+/-"; print the full standard deviation so the output is
    # consistent with the other cross-validation printouts in this file.
    print("%0.4f +/- %0.4f %r" % (grid.cv_results_[cv_keys[0]][r],
                                  grid.cv_results_[cv_keys[1]][r],
                                  grid.cv_results_[cv_keys[2]][r]))
print('Best parameters: %s' % grid.best_params_)
print('Accuracy: %.4f' % grid.best_score_)
0.9667 +/- 0.0105 {'kneighborsclassifier__n_neighbors': 1, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 10}
0.9600 +/- 0.0125 {'kneighborsclassifier__n_neighbors': 1, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 100}
0.9600 +/- 0.0125 {'kneighborsclassifier__n_neighbors': 1, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 10}
0.9533 +/- 0.0170 {'kneighborsclassifier__n_neighbors': 1, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 100}
0.9667 +/- 0.0105 {'kneighborsclassifier__n_neighbors': 5, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 10}
0.9667 +/- 0.0105 {'kneighborsclassifier__n_neighbors': 5, 'meta_classifier__C': 0.1, 'randomforestclassifier__n_estimators': 100}
0.9733 +/- 0.0125 {'kneighborsclassifier__n_neighbors': 5, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 10}
0.9667 +/- 0.0183 {'kneighborsclassifier__n_neighbors': 5, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 100}
Best parameters: {'kneighborsclassifier__n_neighbors': 5, 'meta_classifier__C': 10.0, 'randomforestclassifier__n_estimators': 10}
Accuracy: 0.9733

在不同特征子集上运行的分类器的stacking

from sklearn.pipeline import make_pipeline
from mlxtend.feature_selection import ColumnSelector

# Each pipeline restricts its classifier to a subset of the 4 iris features.
pipe1 = make_pipeline(ColumnSelector(cols=(0, 2)),
                      LogisticRegression())
pipe2 = make_pipeline(ColumnSelector(cols=(1, 2, 3)),
                      LogisticRegression())
sclf_1 = StackingCVClassifier(classifiers=[pipe1, pipe2],
                              meta_classifier=LogisticRegression(),
                              random_state=42)
# FIX: the original called sclf.fit(X, y), fitting the earlier stacking
# model instead of the new pipeline-based one; the captured repr below
# shows the pipeline model, so the intended call is sclf_1.fit.
sclf_1.fit(X, y)
StackingCVClassifier(classifiers=[Pipeline(steps=[('columnselector',
                                                   ColumnSelector(cols=(0, 2))),
                                                  ('logisticregression',
                                                   LogisticRegression())]),
                                  Pipeline(steps=[('columnselector',
                                                   ColumnSelector(cols=(1, 2,
                                                                        3))),
                                                  ('logisticregression',
                                                   LogisticRegression())])],
                     meta_classifier=LogisticRegression(), random_state=42)
# Compare the feature-subset stacking model against the base estimators.
models = [clf1, clf2, clf3, sclf_1]
names = ['KNN', 'RFC', 'Naive Bayes', 'StackingClassifier_1']
for clf, label in zip(models, names):
    scores = cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    print("Accuracy: %0.4f (+/- %0.4f) [%s]" % (scores.mean(), scores.std(), label))
Accuracy: 0.9733 (+/- 0.0094) [KNN]
Accuracy: 0.9667 (+/- 0.0189) [RFC]
Accuracy: 0.9400 (+/- 0.0163) [Naive Bayes]
Accuracy: 0.9600 (+/- 0.0163) [StackingClassifier_1]

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值