We use Blending to build a predictive model on the iris dataset. Blending splits the data into a training set, a validation set, and a test set: the first-layer (base) classifiers are fit on the training set, their predictions on the validation set become the features used to train the second-layer (meta) classifier, and the test set is used for the final evaluation.
1. Take a first look at the dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data
y = iris.target
features = iris.feature_names
print(features)
iris_data = pd.DataFrame(X, columns = features)
iris_data['target'] = y
iris_data.head()
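The iris dataset has 150 samples, four numeric features, and three classes (50 samples per class), so iris_data is a 150-row table with the four feature columns plus the target column.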
2. Select the data we need
# Keep only classes 0 and 1 to turn this into a binary problem
iris_data = iris_data[iris_data['target'] != 2]
y = iris_data['target'].values
# Use only sepal length (features[0]) and petal length (features[2])
X = iris_data[[features[0], features[2]]].values
print("the shape of y:", y.shape)
print("the shape of X:", X.shape)
Only two of the three classes and two of the four features are kept: the binary problem lets us use each classifier's predicted probability of class 1 as a single blending feature, and two features make the decision boundary easy to plot in step 7.
3. Split the data into training, validation, and test sets
from sklearn.model_selection import train_test_split
## Split off the test set (20% of the data)
X_train1, X_test, y_train1, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
## Split the remainder into training and validation sets (30% of it for validation)
X_train, X_val, y_train, y_val = train_test_split(X_train1, y_train1, test_size=0.3, random_state=1)
print("The shape of training X:",X_train.shape)
print("The shape of training y:",y_train.shape)
print("The shape of test X:",X_test.shape)
print("The shape of test y:",y_test.shape)
print("The shape of validation X:",X_val.shape)
print("The shape of validation y:",y_val.shape)
4. Set up the first-layer and second-layer classifiers
# First-layer (base) classifiers
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
clfs = [SVC(probability=True),
        RandomForestClassifier(n_estimators=5, n_jobs=-1, criterion='gini'),
        KNeighborsClassifier()]
# Second-layer (meta) classifier
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
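Using LinearRegression as the meta-learner means the second layer outputs a continuous score rather than a class label. A more conventional choice for a classification task (my substitution, not part of the original recipe) would be a logistic-regression meta-classifier:

from sklearn.linear_model import LogisticRegression

# Alternative second-layer classifier (assumption, not from the tutorial):
# predicts class labels and calibrated probabilities directly
meta_clf = LogisticRegression()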
5. Train the first-layer classifiers on the training set and collect their validation-set and test-set predictions
# Build the first layer's validation-set and test-set outputs
print("The shape of X_val:", X_val.shape)
val_features = np.zeros((X_val.shape[0], len(clfs)))    # one column per base classifier
print("The shape of val_features:", val_features.shape)
test_features = np.zeros((X_test.shape[0], len(clfs)))  # same layout for the test set
print("The shape of test_features:", test_features.shape)
for i, clf in enumerate(clfs):
    # Fit base classifier i on the training set
    clf.fit(X_train, y_train)
    # Its predicted probability of class 1 becomes blending feature i
    val_feature = clf.predict_proba(X_val)[:, 1]
    print('The shape of val_feature:', val_feature.shape)
    test_feature = clf.predict_proba(X_test)[:, 1]
    print('The shape of test_feature:', test_feature.shape)
    val_features[:, i] = val_feature
    test_features[:, i] = test_feature
print('The shape of val_features:', val_features.shape)
print('The shape of test_features:', test_features.shape)
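This loop is the core of Blending, and it can be wrapped in a small helper so the same feature-building step is reusable. A minimal sketch (the name blend_features and its signature are my own, not from the tutorial):

import numpy as np

def blend_features(clfs, X_train, y_train, *feature_sets):
    # Fit each base classifier once, then return one matrix per input array,
    # where column i holds classifier i's predicted probability of class 1.
    outputs = [np.zeros((Xs.shape[0], len(clfs))) for Xs in feature_sets]
    for i, clf in enumerate(clfs):
        clf.fit(X_train, y_train)
        for out, Xs in zip(outputs, feature_sets):
            out[:, i] = clf.predict_proba(Xs)[:, 1]
    return outputs

# Equivalent to the loop above:
# val_features, test_features = blend_features(clfs, X_train, y_train, X_val, X_test)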
6. Train the second-layer classifier on the first layer's validation-set outputs and get the final results
# Train the second-layer classifier on the first layer's validation-set outputs
lr.fit(val_features, y_val)
print("the shape of test_features:", test_features.shape)
print("the shape of X_test:", X_test.shape)
print("the shape of y_test", y_test.shape)
# Evaluate the second-layer model on the test-set blending features
from sklearn.model_selection import cross_val_score
cross_val_score(lr, test_features, y_test, cv=5)
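Note that cross_val_score re-fits lr on folds of the test-set features rather than scoring the already-trained model. A minimal alternative sketch for a single test-set score (my addition; the 0.5 threshold is an assumption, needed because LinearRegression returns a continuous score rather than a class label):

from sklearn.metrics import accuracy_score

y_pred = (lr.predict(test_features) >= 0.5).astype(int)  # 0.5 threshold is an assumption
print("blending test accuracy:", accuracy_score(y_test, y_pred))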
7. Plot the decision boundary of the trained ensemble
# Build a grid covering the range of the two plotted features
x_min = X[:, 0].min() - 1
x_max = X[:, 0].max() + 1
y_min = X[:, 1].min() - 1
y_max = X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
f, axarr = plt.subplots(nrows=1, ncols=1, sharex='col', sharey='row', figsize=(6, 6))
feature_grid = np.c_[xx.ravel(), yy.ravel()]
# First-layer outputs at every grid point; renamed from `features` to avoid
# shadowing the feature-name list from step 1
grid_features = np.zeros((feature_grid.shape[0], len(clfs)))
for i, clf in enumerate(clfs):
    feature = clf.predict_proba(feature_grid)[:, 1]
    print('The shape of feature:', feature.shape)
    grid_features[:, i] = feature
# The second layer's prediction over the grid gives the decision surface
Z = lr.predict(grid_features)
Z = Z.reshape(xx.shape)
axarr.contourf(xx, yy, Z, alpha=0.3)
axarr.scatter(X[y == 0, 0], X[y == 0, 1], c='blue', marker='^')
axarr.scatter(X[y == 1, 0], X[y == 1, 1], c='red', marker='o')
axarr.set_xlabel(features[0])
axarr.set_ylabel(features[2])
plt.show()