Ensemble Methods
The goal of ensemble methods is to combine the predictions of several base estimators in order to improve the generalizability and robustness of a single estimator.
Two families of ensemble methods are usually distinguished:
- Averaging methods: build several estimators independently, then average their predictions, e.g., Bagging, Forests of randomized trees
- Boosting methods: build the base estimators sequentially, each one trying to reduce the bias of the combined estimator, e.g., AdaBoost, Gradient Tree Boosting
Bagging meta-estimator
In ensemble algorithms, bagging methods form a class of algorithms that build several instances of a black-box estimator on random subsets of the original training set, and then aggregate their individual predictions to form a final prediction. These methods reduce the variance of a single base estimator by introducing randomization into its construction procedure and then making an ensemble out of it. In many cases, bagging methods work without requiring any modification of the underlying base algorithm.
In scikit-learn, bagging methods are offered as a unified BaggingClassifier meta-estimator (and BaggingRegressor for regression). Individual bagging methods differ from each other in the way they draw random subsets of the training set.
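For instance, a minimal BaggingClassifier sketch (the KNN base estimator and the 0.5 subset fractions are illustrative choices):
# coding: utf-8
# Bagging a KNN classifier on random subsets of samples and features
from sklearn.datasets import load_iris
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

X, y = load_iris(return_X_y=True)
# max_samples / max_features control the fraction of samples and
# features drawn for each base estimator (samples are drawn with
# replacement by default)
bagging = BaggingClassifier(KNeighborsClassifier(),
                            max_samples=0.5, max_features=0.5,
                            random_state=0)
print("Accuracy: %0.2f" % cross_val_score(bagging, X, y, cv=5).mean())
The bias-variance decomposition below compares a single decision tree with a bagged ensemble of trees on a toy regression problem.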
# coding: utf-8
# Single estimator versus bagging: bias-variance decomposition
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor

# Settings
n_repeat = 50  # Number of iterations for estimating expectations
n_train = 50   # Size of each training set
n_test = 1000  # Size of the test set
noise = 0.1    # Standard deviation of the noise
np.random.seed(0)

estimators = [("Tree", DecisionTreeRegressor()),
              ("Bagging(Tree)", BaggingRegressor(DecisionTreeRegressor()))]
n_estimators = len(estimators)

def f(x):
    """Target function to estimate."""
    x = x.ravel()
    return np.exp(-x ** 2) + 1.5 * np.exp(-(x - 2) ** 2)

def generate(n_samples, noise, n_repeat=1):
    """Draw one (or n_repeat) noisy sample(s) of f on [-5, 5]."""
    X = np.random.rand(n_samples) * 10 - 5
    X = np.sort(X)
    if n_repeat == 1:
        y = f(X) + np.random.normal(0.0, noise, n_samples)
    else:
        y = np.zeros((n_samples, n_repeat))
        for i in range(n_repeat):
            y[:, i] = f(X) + np.random.normal(0.0, noise, n_samples)
    X = X.reshape((n_samples, 1))
    return X, y

# n_repeat independent training sets, plus one test set with
# n_repeat noise realizations per test point
X_train = []
y_train = []
for i in range(n_repeat):
    X, y = generate(n_samples=n_train, noise=noise)
    X_train.append(X)
    y_train.append(y)
X_test, y_test = generate(n_samples=n_test, noise=noise, n_repeat=n_repeat)

plt.figure(figsize=(10, 8))
for n, (name, estimator) in enumerate(estimators):
    # Compute predictions over the n_repeat training sets
    y_predict = np.zeros((n_test, n_repeat))
    for i in range(n_repeat):
        estimator.fit(X_train[i], y_train[i])
        y_predict[:, i] = estimator.predict(X_test)

    # Bias^2 + variance + noise decomposition of the mean squared error
    y_error = np.zeros(n_test)
    for i in range(n_repeat):
        for j in range(n_repeat):
            y_error += (y_test[:, j] - y_predict[:, i]) ** 2
    y_error /= (n_repeat * n_repeat)

    y_noise = np.var(y_test, axis=1)
    y_bias = (f(X_test) - np.mean(y_predict, axis=1)) ** 2
    y_var = np.var(y_predict, axis=1)

    print("{0}: {1:.4f} (error) = {2:.4f} (bias^2) "
          " + {3:.4f} (var) + {4:.4f} (noise)".format(name,
                                                      np.mean(y_error),
                                                      np.mean(y_bias),
                                                      np.mean(y_var),
                                                      np.mean(y_noise)))

    # Upper row: the target function, one training set, the predictions
    plt.subplot(2, n_estimators, n + 1)
    plt.plot(X_test, f(X_test), "b", label="$f(x)$")
    plt.plot(X_train[0], y_train[0], ".b", label="LS ~ $y = f(x)+noise$")
    for i in range(n_repeat):
        if i == 0:
            plt.plot(X_test, y_predict[:, i], "r", label=r"$\^y(x)$")
        else:
            plt.plot(X_test, y_predict[:, i], "r", alpha=0.05)
    plt.plot(X_test, np.mean(y_predict, axis=1), "c",
             label=r"$\mathbb{E}_{LS} \^y(x)$")
    plt.xlim([-5, 5])
    plt.title(name)
    if n == n_estimators - 1:
        plt.legend(loc=(1.1, .5))

    # Lower row: the pointwise decomposition of the error
    plt.subplot(2, n_estimators, n_estimators + n + 1)
    plt.plot(X_test, y_error, "r", label="$error(x)$")
    plt.plot(X_test, y_bias, "b", label="$bias^2(x)$")
    plt.plot(X_test, y_var, "g", label="$variance(x)$")
    plt.plot(X_test, y_noise, "c", label="$noise(x)$")
    plt.xlim([-5, 5])
    plt.ylim([0, 0.1])
    if n == n_estimators - 1:
        plt.legend(loc=(1.1, .5))

plt.subplots_adjust(right=.75)
plt.show()
Random Forests
The sklearn.ensemble module includes two averaging algorithms based on randomized decision trees: the RandomForest algorithm and the Extra-Trees method. The prediction of the ensemble is given as the averaged prediction of the individual classifiers.
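A minimal sketch of both estimators (the iris data and n_estimators=100 are illustrative choices):
# coding: utf-8
# Random forest versus extra-trees on a small classification task
from sklearn.datasets import load_iris
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.model_selection import cross_val_score

X, y = load_iris(return_X_y=True)
# A random forest grows each tree on a bootstrap sample and searches for
# the best split among a random feature subset; extra-trees additionally
# draw the candidate split thresholds at random
for name, clf in [("RandomForest", RandomForestClassifier(n_estimators=100,
                                                          random_state=0)),
                  ("ExtraTrees", ExtraTreesClassifier(n_estimators=100,
                                                      random_state=0))]:
    scores = cross_val_score(clf, X, y, cv=5)
    print("%s accuracy: %0.2f" % (name, scores.mean()))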
AdaBoost
The core principle of AdaBoost is to fit a sequence of weak learners on repeatedly modified versions of the data. The predictions from all of them are then combined through a weighted majority vote to produce the final prediction. The data modifications at each boosting iteration consist of applying weights to each of the training samples.
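Since the combination rule is a weighted majority vote, the classification variant illustrates the principle most directly; a minimal AdaBoostClassifier sketch (n_estimators=100 is an illustrative choice):
# coding: utf-8
# AdaBoost with the default base learner (depth-1 decision stumps)
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score

X, y = load_iris(return_X_y=True)
# Each iteration reweights the training samples so that the next stump
# focuses on the examples the previous ones got wrong
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
print("Accuracy: %0.2f" % cross_val_score(clf, X, y, cv=5).mean())
The regression demo below fits a single decision tree and an AdaBoost ensemble of 300 trees to a noisy sine curve: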
# coding: utf-8
# Decision Tree Regression with AdaBoost
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
rng = np.random.RandomState(1)
X = np.linspace(0, 6, 100)[:, np.newaxis]
y = np.sin(X).ravel() + np.sin(6 * X).ravel() + rng.normal(0, 0.1, X.shape[0])
regr_1 = DecisionTreeRegressor(max_depth=4)
regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                           n_estimators=300, random_state=rng)
regr_1.fit(X, y)
regr_2.fit(X, y)
y_1 = regr_1.predict(X)
y_2 = regr_2.predict(X)
plt.figure()
plt.scatter(X, y, c='k', label='training samples')
plt.plot(X, y_1, c="g", label="n_estimators=1", linewidth=2)
plt.plot(X, y_2, c="r", label="n_estimators=300", linewidth=2)
plt.xlabel("data")
plt.ylabel("target")
plt.title("Boosted Decision Tree Regression")
plt.legend()
plt.show()
Gradient Tree Boosting
Gradient Tree Boosting, or Gradient Boosted Decision Trees (GBDT), is a generalization of boosting to arbitrary differentiable loss functions, and it can be used for both regression and classification problems.
GradientBoostingClassifier supports both binary and multi-class classification. GradientBoostingRegressor supports a number of different loss functions for regression, which can be specified via the loss parameter.
# coding: utf-8
# Gradient Boosting regression
from sklearn import datasets, ensemble
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=13)
params = {'n_estimators': 500,
          'max_depth': 4,
          'min_samples_split': 5,
          'learning_rate': 0.01,
          'loss': 'squared_error'}  # named 'ls' in scikit-learn < 1.0
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
mse = mean_squared_error(y_test, reg.predict(X_test))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse))
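GradientBoostingClassifier follows the same pattern for classification; a minimal multi-class sketch (the iris data and hyperparameter values are illustrative choices):
# coding: utf-8
# Gradient boosting on a three-class problem
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)  # three classes, so multi-class boosting
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13)
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                 max_depth=3, random_state=0)
clf.fit(X_train, y_train)
print("Accuracy on test set: {:.4f}".format(clf.score(X_test, y_test)))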
Histogram-Based Gradient Boosting
Histogram-based estimators (HistGradientBoostingClassifier and HistGradientBoostingRegressor) can be much faster than GradientBoostingClassifier and GradientBoostingRegressor when the number of samples reaches the tens of thousands, because they bin the continuous input features into integer-valued histograms before growing the trees.
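A minimal sketch on a synthetic dataset large enough for the binning to pay off (the 50,000-sample size is an illustrative choice):
# coding: utf-8
# Histogram-based gradient boosting on a larger synthetic dataset
from sklearn.experimental import enable_hist_gradient_boosting  # noqa: needed only on scikit-learn < 1.0
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=50000, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = HistGradientBoostingClassifier(max_iter=100, random_state=0)
clf.fit(X_train, y_train)
print("Accuracy on test set: {:.4f}".format(clf.score(X_test, y_test)))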
Voting Classifier
The idea behind the VotingClassifier is to combine conceptually different machine learning classifiers and use a majority vote or the average of predicted probabilities to predict the class labels. Such a classifier can be useful for a set of equally well performing models in order to balance out their individual weaknesses.
Majority voting (hard voting)
In majority voting, with voting='hard', the predicted class label for a particular sample is the class label that represents the majority of the labels predicted by the individual classifiers.
# coding: utf-8
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = GaussianNB()
eclf = VotingClassifier(
    estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)],
    voting='hard')
for clf, label in zip([clf1, clf2, clf3, eclf],
                      ['LR', 'RF', 'NB', 'En']):
    scores = cross_val_score(clf, X, y, scoring='accuracy', cv=5)
    print('Accuracy: %0.2f (+/- %0.2f) [%s]' % (scores.mean(),
                                                scores.std(),
                                                label))
Weighted average probabilities (soft voting)
Soft voting returns the class label with the largest sum of predicted probabilities, where each classifier's probabilities can be weighted via the weights parameter.
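To make the rule concrete, here is the arithmetic on made-up probabilities for a single sample, using the same weights [2, 1, 2] as the demo that follows:
# coding: utf-8
# Soft-voting arithmetic on hypothetical class probabilities
import numpy as np

probas = np.array([[0.2, 0.5, 0.3],   # classifier 1
                   [0.6, 0.3, 0.1],   # classifier 2
                   [0.3, 0.4, 0.3]])  # classifier 3
weights = np.array([2, 1, 2])
# Weighted average of the probabilities, then argmax over the classes
avg = np.average(probas, axis=0, weights=weights)
print(avg, "-> predicted class:", np.argmax(avg))  # class 1 wins here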
# coding: utf-8
import numpy as np
import matplotlib.pyplot as plt
from itertools import product  # Cartesian product of subplot indices
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

iris = datasets.load_iris()
X, y = iris.data[:, [0, 2]], iris.target
clf1 = DecisionTreeClassifier(max_depth=4)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = SVC(kernel='rbf', probability=True)  # probability=True is required for soft voting
eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2),
                                    ('svc', clf3)],
                        voting='soft', weights=[2, 1, 2])
clf1.fit(X, y)
clf2.fit(X, y)
clf3.fit(X, y)
eclf.fit(X, y)

# Plot the decision boundary of each classifier on a grid
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))
f, axarr = plt.subplots(2, 2, sharex='col', sharey='row', figsize=(10, 8))
for idx, clf, tt in zip(product([0, 1], [0, 1]),
                        [clf1, clf2, clf3, eclf],
                        ['Decision Tree (depth=4)', 'KNN (k=7)',
                         'Kernel SVM', 'Soft Voting']):
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    axarr[idx[0], idx[1]].contourf(xx, yy, Z, alpha=0.4)
    axarr[idx[0], idx[1]].scatter(X[:, 0], X[:, 1], c=y,
                                  s=20, edgecolor='k')
    axarr[idx[0], idx[1]].set_title(tt)
plt.show()
Voting Regressor
The idea behind the VotingRegressor is to combine conceptually different machine learning regressors and return the average of their predicted values.
# coding: utf-8
import matplotlib.pyplot as plt
from sklearn.datasets import load_diabetes
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor
X, y = load_diabetes(return_X_y=True)
reg1 = GradientBoostingRegressor(random_state=1)
reg2 = RandomForestRegressor(random_state=1)
reg3 = LinearRegression()
reg1.fit(X, y)
reg2.fit(X, y)
reg3.fit(X, y)
ereg = VotingRegressor([('gb', reg1), ('rf', reg2), ('lr', reg3)])
ereg.fit(X, y)
xt = X[:20]
pred1 = reg1.predict(xt)
pred2 = reg2.predict(xt)
pred3 = reg3.predict(xt)
pred4 = ereg.predict(xt)
plt.figure()
plt.plot(pred1, 'gd', label='GradientBoostingRegressor')
plt.plot(pred2, 'b^', label='RandomForestRegressor')
plt.plot(pred3, 'ys', label='LinearRegression')
plt.plot(pred4, 'r*', ms=10, label='VotingRegressor')
plt.tick_params(axis='x', which='both', bottom=False, top=False,
                labelbottom=False)
plt.ylabel('predicted')
plt.xlabel('training samples')
plt.legend(loc="best")
plt.title('Regressor predictions and their average')
plt.show()
Stacked generalization
Stacked generalization is a method for combining different estimators to reduce their biases. More precisely, the predictions of the individual estimators are stacked together and used as input to a final estimator, which computes the final prediction. The final estimator is trained through cross-validation.
StackingClassifier and StackingRegressor provide such strategies for classification and regression problems respectively.
# coding: utf-8
from sklearn.datasets import load_diabetes  # load_boston was removed in scikit-learn 1.2
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Base estimators whose cross-validated predictions feed the final estimator
estimators = [('ridge', RidgeCV()),
              ('lasso', LassoCV(random_state=42)),
              ('svr', SVR(C=1, gamma=1e-6))]
reg = StackingRegressor(
    estimators=estimators,
    final_estimator=GradientBoostingRegressor(random_state=42))
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
print('R2 score: {:.2f}'.format(r2_score(y_test, y_pred)))