Voting就是投票的意思。这种集成算法一般应用于分类问题。思路很简单:假如有5种机器学习模型来进行分类预测,就拥有5个预测的结果集。那么这5种模型,一种模型一票,然后遵循少数服从多数原则得出最终预测。
from sklearn import datasets
import matplotlib.pyplot as plt
# Build a toy two-class "moons" dataset for visualising clustering/classification:
#   n_samples -- int, optional (default 100): total number of sample points generated
#   shuffle   -- bool, optional (default True): whether to shuffle the samples
#   noise     -- float or None (default None): std. dev. of Gaussian noise added to the data
X, y = datasets.make_moons(n_samples=500, shuffle=True, noise=0.3, random_state=42)
# Plot each class separately so matplotlib assigns distinct colours.
for label in (0, 1):
    plt.scatter(X[y == label, 0], X[y == label, 1])
plt.show()
from sklearn.model_selection import train_test_split
# Split into train/test sets (default split ratio is 75%/25%); fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Classifier 1: logistic regression.
from sklearn.linear_model import LogisticRegression
log_clf = LogisticRegression()
# fit() returns the estimator itself, so fitting and predicting can be chained.
y_predict1 = log_clf.fit(X_train, y_train).predict(X_test)
print(log_clf.score(X_test, y_test))
# Classifier 2: k-nearest neighbours.
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
# fit() returns the estimator itself, so fitting and predicting can be chained.
y_predict2 = knn_clf.fit(X_train, y_train).predict(X_test)
print(knn_clf.score(X_test, y_test))
# Classifier 3: decision tree (seeded so tie-breaking splits are reproducible).
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(random_state=1)
# fit() returns the estimator itself, so fitting and predicting can be chained.
y_predict3 = dt_clf.fit(X_train, y_train).predict(X_test)
print(dt_clf.score(X_test, y_test))
import numpy as np
# NOTE: `sklearn.metrics.classification` was a private module removed in
# scikit-learn 0.24 — import accuracy_score from the public `sklearn.metrics`.
from sklearn.metrics import accuracy_score
# Manual hard-voting ensemble: the three label vectors are 0/1, so a sample is
# predicted as class 1 exactly when at least 2 of the 3 models voted 1.
# `np.int` was deprecated in NumPy 1.20 and removed in 1.24; the builtin `int`
# gives the same integer dtype.
y_predict = np.array((y_predict1 + y_predict2 + y_predict3) >= 2, dtype=int)
print(y_predict1[:10])
print(y_predict2[:10])
print(y_predict3[:10])
print(y_predict[:10])
print(accuracy_score(y_test, y_predict))
# Scikit-learn's built-in ensemble over the same three models; the default
# voting='hard' reproduces the majority vote implemented manually above.
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(estimators=[
    ('log_clf', LogisticRegression()),
    ('knn_clf', KNeighborsClassifier()),
    ('dt_clf', DecisionTreeClassifier(random_state=1)),
])
voting_clf.fit(X_train, y_train)
print(voting_clf.score(X_test, y_test))
out:
0.864
0.912
0.856
[1 0 0 1 1 1 0 0 0 0]
[0 0 1 1 1 1 0 0 0 0]
[1 0 1 1 1 1 0 0 0 0]
[1 0 1 1 1 1 0 0 0 0]
0.92
0.92