Section I: Code Bundle and Result Analyses
Personal Views:
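Bagging in a nutshell:
- Bootstrap samples are drawn with replacement to train each base estimator
- Features may also be subsampled per estimator, with or without replacement (the code below uses all features without replacement: max_features=2, bootstrap_features=False)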
代码:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn import datasets
import matplotlib.pyplot as plt
import warnings

# Suppress every warning for the rest of the script. Note that this also
# hides library deprecation warnings.
warnings.filterwarnings("ignore")

# Use 200 dpi both for on-screen figures and for files written by savefig.
plt.rcParams.update({'figure.dpi': 200, 'savefig.dpi': 200})

# Default font for all text drawn by matplotlib.
font = dict(family='Times New Roman', weight='light')
plt.rc("font", **font)
# Section 1: Load the wine data and split it into train/test sets
from sklearn.preprocessing import LabelEncoder

wine = datasets.load_wine()

# Drop every sample whose class label is 1, and keep only feature
# columns 0 and 11 so the data can be drawn in a 2-D plot later on.
not_class_one = wine.target != 1
two_columns = wine.data[:, [0, 11]]
X = two_columns[not_class_one]
y = wine.target[not_class_one]

# Re-encode the remaining class labels as consecutive integers from 0.
le = LabelEncoder()
y = le.fit_transform(y)

# Hold out 20% of the samples for testing; stratify on y so both splits
# keep the same class proportions, and fix the seed for reproducibility.
split_options = dict(test_size=0.2, random_state=1, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
# Section 2: Compare a single DecisionTreeClassifier with a BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score

# Unpruned decision tree (max_depth=None) used both on its own and as the
# base estimator of the bagging ensemble.
tree = DecisionTreeClassifier(criterion='entropy',
                              random_state=1,
                              max_depth=None)

# The base estimator is passed positionally on purpose: its keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2 and the
# old name was removed in 1.4, so the positional form works on both.
# Each of the 500 trees is trained on a bootstrap sample of the rows
# (bootstrap=True) and sees both features (max_features=2, drawn without
# replacement because bootstrap_features=False).
bag = BaggingClassifier(tree,
                        n_estimators=500,
                        max_features=2,
                        bootstrap=True,
                        bootstrap_features=False,
                        n_jobs=1,
                        random_state=1)

# Single decision tree: fit, then score on the train and test splits.
tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print("Decision tree train/test accuracies %.3f/%.3f" % (tree_train, tree_test))

# Bagging ensemble: same procedure, reusing the prediction variables.
bag = bag.fit(X_train, y_train)
y_train_pred = bag.predict(X_train)
y_test_pred = bag.predict(X_test)
bag_train = accuracy_score(y_train, y_train_pred)
bag_test = accuracy_score(y_test, y_test_pred)
print("Bag train/test accuracies %.3f/%.3f" % (bag_train, bag_test))
# Section 3: Visualize decision boundaries
import numpy as np

# Plot region: the range of the training data padded by one unit on each
# side. The x-axis is feature column 0, the y-axis is feature column 1 of
# X_train (columns 0 and 11 of the original wine data).
x_min = X_train[:, 0].min() - 1
x_max = X_train[:, 0].max() + 1
y_min = X_train[:, 1].min() - 1
y_max = X_train[:, 1].max() + 1

# Dense grid (step 0.1) on which each classifier is evaluated.
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))

# One panel per classifier, side by side, with a shared y-axis.
f, axarr = plt.subplots(nrows=1, ncols=2,
                        sharex='col',
                        sharey='row',
                        figsize=(8, 3))

for idx, clf, tt in zip([0, 1],
                        [tree, bag],
                        ['Decision Tree', 'Bag']):
    # Both models were already fitted on (X_train, y_train) in Section 2
    # with fixed random seeds, so they are not refitted here.
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Shaded background: predicted class at every grid point.
    axarr[idx].contourf(xx, yy, Z, alpha=0.3)
    # Training samples drawn on top, one marker style per class.
    axarr[idx].scatter(X_train[y_train == 0, 0], X_train[y_train == 0, 1],
                       c='blue', marker='^')
    axarr[idx].scatter(X_train[y_train == 1, 0], X_train[y_train == 1, 1],
                       c='red', marker='o')
    axarr[idx].set_title(tt)

# The y-axis shows wine-data column 11 (the diluted-wines measurement) and
# the x-axis shows column 0 (alcohol); the labels are assigned accordingly.
axarr[0].set_ylabel("Diluted Wines", fontsize=12)
# Shared x-axis caption, placed in data coordinates of the last panel,
# below the axes.
plt.text(10.2, -1.2, s='Alcohol', ha='center', va='center', fontsize=12)
plt.tight_layout()
plt.savefig('./fig2.png')
plt.show()
结果:
预测精度:
Decision tree train/test accuracies 1.000/0.955
Bag train/test accuracies 1.000/0.955
参考文献:
Sebastian Raschka, Vahid Mirjalili. Python机器学习第二版. 南京:东南大学出版社,2018.