I use a decision stump (a one-level decision tree) as the base learner, so let's implement the stump algorithm first.
import numpy as np
import math

# Decision stump: a decision tree with a single split.
class DecisionStump:
    def __init__(self):
        self.bestFeature = -1         # index of the feature used for the split (0-based)
        self.threshold = math.inf     # split threshold
        self.error = math.inf         # weighted training error of the best split
        self.direction = 'more than'  # 'more than': predict +1 when value > threshold

    # Train the stump; when weights is None, use uniform sample weights.
    def train(self, X, y, weights=None):
        m, n = X.shape
        if weights is None:
            weights = np.ones(m, dtype=np.float32) / m
        for i in range(n):
            values = X[:, i]
            err, val, direct = self.__calStumps(values, y, weights)
            if err < self.error:
                self.error = err
                self.threshold = val
                self.direction = direct
                self.bestFeature = i

    # For one feature column, find the best split point; returns the weighted
    # error err, the best threshold val, and the prediction direction.
    @staticmethod
    def __calStumps(values, y, weights):
        m = len(values)
        err = math.inf
        bestThreshold = 0.0
        direct = 'more than'
        # Candidate thresholds: midpoints of consecutive sorted values, plus
        # one point below the minimum and one above the maximum.
        sortedValues = sorted(values)
        thresholds = [(sortedValues[i] + sortedValues[i + 1]) / 2 for i in range(len(sortedValues) - 1)]
        thresholds.insert(0, sortedValues[0] - 1)
        thresholds.append(sortedValues[-1] + 1)
        for threshold in thresholds:
            # Try both polarities: +1 above the threshold, or -1 above it.
            compare_array_positive = np.array([1 if values[k] > threshold else -1 for k in range(m)])
            weight_error_positive = np.sum((compare_array_positive != y).astype(int) * weights)
            compare_array_negative = np.array([-1 if values[k] > threshold else 1 for k in range(m)])
            weight_error_negative = np.sum((compare_array_negative != y).astype(int) * weights)
            if weight_error_positive < weight_error_negative:
                weight_error = weight_error_positive
                tmp_direct = 'more than'
            else:
                weight_error = weight_error_negative
                tmp_direct = 'less than'
            if weight_error < err:
                err = weight_error
                bestThreshold = threshold
                direct = tmp_direct
        return err, bestThreshold, direct

    # Predict labels for a new sample matrix with the trained stump.
    def predict(self, X):
        values = X[:, self.bestFeature]
        if self.direction == 'more than':
            y_pred = np.array([1 if val > self.threshold else -1 for val in values])
        else:
            y_pred = np.array([-1 if val > self.threshold else 1 for val in values])
        return y_pred
Next comes the AdaBoost algorithm itself, which turns out to be fairly simple to implement.
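For orientation, these are the textbook AdaBoost update rules that the train loop below implements (standard formulas, not anything specific to this post): in round t, with weighted error err_t of stump h_t,

    err_t   = sum_i w_i * 1[h_t(x_i) != y_i]
    alpha_t = 0.5 * ln((1 - err_t) / err_t)
    w_i    <- w_i * exp(-alpha_t * y_i * h_t(x_i)) / Z_t

where Z_t is the normalizer that keeps the weights summing to 1. Note that the exponent uses alpha_t, not err_t.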
class Adaboost:
    def __init__(self, n_estimator=10):
        '''
        Parameters
        ----------
        n_estimator : int
            Number of base learners to train.
        '''
        self.weights = None                       # per-sample weights
        self.n_estimator = n_estimator            # number of base learners
        self.X = None
        self.y = None
        self.eps = 1e-7
        self.alphas = np.zeros(self.n_estimator)  # weight of each base learner
        self.estimators = []                      # the trained base learners

    def __initParams(self, X, y):
        m, n = X.shape
        self.X = X
        self.y = y
        self.weights = np.ones(m, dtype=np.float32) / m

    def train(self, X, y):
        self.__initParams(X, y)
        for i in range(self.n_estimator):
            base = DecisionStump()
            base.train(self.X, self.y, self.weights)
            y_pred = base.predict(X)
            self.estimators.append(base)
            err = np.sum(self.weights * (y_pred != self.y).astype(int))
            if err < self.eps:
                # A base learner with (near) zero error already classifies
                # the data perfectly; give it a positive vote and stop early.
                print("Encounter Strong learner!")
                self.alphas[i] = 1.0
                break
            self.alphas[i] = 1 / 2 * math.log((1 - err) / err)
            # Reweight the samples so misclassified ones gain weight;
            # the exponent uses alpha, not err.
            exp_term = np.exp(-self.alphas[i] * self.y * y_pred)
            Z = np.sum(self.weights * exp_term)
            self.weights = self.weights * exp_term / Z
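The tests below call model2.predict, but the snippet above does not define a predict method for Adaboost. Here is a minimal sketch of the standard AdaBoost prediction, the sign of the alpha-weighted vote of the base learners (indented to sit inside the Adaboost class):

    # Standard AdaBoost prediction: sign of the alpha-weighted vote.
    def predict(self, X):
        agg = np.zeros(X.shape[0])
        for alpha, estimator in zip(self.alphas, self.estimators):
            agg += alpha * estimator.predict(X)
        return np.where(agg >= 0, 1, -1)

Because zip stops at the shorter sequence, estimators dropped by the early break are simply never consulted, and the alpha = 1.0 set in the zero-error branch lets a perfect single learner cast the deciding vote.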
For testing, I use a one-dimensional toy dataset and the four-dimensional iris dataset.
Conveniently, when we take two of the iris classes for binary classification, a single DecisionStump already separates the data perfectly, so AdaBoost runs into the zero-error base model case.
# test1: a single stump on one-dimensional toy data
X = np.arange(10).reshape(10, 1)
y = np.array([1, 1, 1, -1, -1, -1, 1, 1, 1, -1])
m, n = X.shape
model1 = DecisionStump()
model1.train(X, y)
print(model1.threshold)  # threshold=2.5
print((model1.predict(X) == y).astype(int).mean())

# test2: AdaBoost on the same data
model2 = Adaboost(15)
model2.train(X, y)
print(len(model2.estimators))
print((model2.predict(X) == y).astype(int).mean())
# test3: a single stump on two classes of the iris data
from sklearn import datasets
import matplotlib.pyplot as plt

iris = datasets.load_iris()
X = iris.data
y = iris.target
X = X[y != 2]
y = y[y != 2]
y[y == 0] = -1
plt.scatter(X[y == -1, 2], X[y == -1, 0])
plt.scatter(X[y == 1, 2], X[y == 1, 0])
plt.show()

model3 = DecisionStump()
model3.train(X, y)
print(model3.bestFeature)  # bestFeature=2
print(model3.threshold)    # threshold=1.9
y_hat = model3.predict(X)
print((y_hat == y).astype(int).mean())

# test4: AdaBoost on the iris data (hits the zero-error case)
model4 = Adaboost()
model4.train(X, y)
print(len(model4.estimators))
print((model4.predict(X) == y).astype(int).mean())
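As a quick sanity check (my addition, not part of the original tests), scikit-learn's AdaBoostClassifier, whose default base learner is a depth-1 decision tree, i.e. a stump, should reach the same perfect accuracy on this data:

# Reference run with scikit-learn's AdaBoost; score should also print 1.0.
from sklearn.ensemble import AdaBoostClassifier

ref = AdaBoostClassifier(n_estimators=10)
ref.fit(X, y)
print(ref.score(X, y))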