I use a decision stump (a one-level decision tree) as the base learner, so let's implement the stump algorithm first.
import numpy as np
import math

# Decision stump: a decision tree with a single split.
class DecisionStump:
    def __init__(self):
        self.bestFeature = -1         # index of the feature used for the split (0-based)
        self.threshold = math.inf     # split threshold
        self.error = math.inf         # weighted training error of the best split
        self.direction = 'more than'  # 'more than': predict +1 when value > threshold

    # Train the stump; when weights is None, use uniform sample weights.
    def train(self, X, y, weights=None):
        m, n = X.shape
        if weights is None:
            weights = np.ones(m, dtype=np.float32) / m
        for i in range(n):
            values = X[:, i]
            err, val, direct = self.__calStumps(values, y, weights)
            if err < self.error:
                self.error = err
                self.threshold = val
                self.direction = direct
                self.bestFeature = i

    # For one feature column, find the best split point; returns the weighted
    # error err, the best threshold val, and the prediction direction.
    @staticmethod
    def __calStumps(values, y, weights):
        m = len(values)
        err = math.inf
        bestThreshold = 0.0
        direct = 'more than'
        # Candidate thresholds: midpoints of consecutive sorted values, plus
        # one point below the minimum and one above the maximum.
        sortedValues = sorted(values)
        thresholds = [(sortedValues[i] + sortedValues[i + 1]) / 2 for i in range(len(sortedValues) - 1)]
        thresholds.insert(0, sortedValues[0] - 1)
        thresholds.append(sortedValues[-1] + 1)
        for threshold in thresholds:
            # Try both polarities: +1 above the threshold, or -1 above it.
            compare_array_positive = np.array([1 if values[k] > threshold else -1 for k in range(m)])
            weight_error_positive = np.sum((compare_array_positive != y).astype(int) * weights)
            compare_array_negative = np.array([-1 if values[k] > threshold else 1 for k in range(m)])
            weight_error_negative = np.sum((compare_array_negative != y).astype(int) * weights)
            if weight_error_positive < weight_error_negative:
                weight_error = weight_error_positive
                tmp_direct = 'more than'
            else:
                weight_error = weight_error_negative
                tmp_direct = 'less than'
            if weight_error < err:
                err = weight_error
                bestThreshold = threshold
                direct = tmp_direct
        return err, bestThreshold, direct

    # Predict labels for a new sample matrix with the trained stump.
    def predict(self, X):
        values = X[:, self.bestFeature]
        if self.direction == 'more than':
            y_pred = np.array([1 if val > self.threshold else -1 for val in values])
        else:
            y_pred = np.array([-1 if val > self.threshold else 1 for val in values])
        return y_pred
Next comes the AdaBoost algorithm itself, which turns out to be fairly simple to implement.
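For orientation, these are the textbook AdaBoost update rules that the train loop below implements (standard formulas, not anything specific to this post): in round t, with weighted error err_t of stump h_t,

    err_t   = sum_i w_i * 1[h_t(x_i) != y_i]
    alpha_t = 0.5 * ln((1 - err_t) / err_t)
    w_i    <- w_i * exp(-alpha_t * y_i * h_t(x_i)) / Z_t

where Z_t is the normalizer that keeps the weights summing to 1. Note that the exponent uses alpha_t, not err_t.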
class Adaboost:
    def __init__(self, n_estimator=10):
        '''
        Parameters
        ----------
        n_estimator : int
            Number of base learners to train.
        '''
        self.weights = None                       # per-sample weights
        self.n_estimator = n_estimator            # number of base learners
        self.X = None
        self.y = None
        self.eps = 1e-7
        self.alphas = np.zeros(self.n_estimator)  # weight of each base learner
        self.estimators = []                      # the trained base learners

    def __initParams(self, X, y):
        m, n = X.shape
        self.X = X
        self.y = y
        self.weights = np.ones(m, dtype=np.float32) / m

    def train(self, X, y):
        self.__initParams(X, y)
        for i in range(self.n_estimator):
            base = DecisionStump()
            base.train(self.X, self.y, self.weights)
            y_pred = base.predict(X)
            self.estimators.append(base)
            err = np.sum(self.weights * (y_pred != self.y).astype(int))
            if err < self.eps:
                # A base learner with (near) zero error already classifies
                # the data perfectly; give it a positive vote and stop early.
                print("Encounter Strong learner!")
                self.alphas[i] = 1.0
                break
            self.alphas[i] = 1 / 2 * math.log((1 - err) / err)
            # Reweight the samples so misclassified ones gain weight;
            # the exponent uses alpha, not err.
            exp_term = np.exp(-self.alphas[i] * self.y * y_pred)
            Z = np.sum(self.weights * exp_term)
            self.weights = self.weights * exp_term / Z
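The tests below call model2.predict, but the snippet above does not define a predict method for Adaboost. Here is a minimal sketch of the standard AdaBoost prediction, the sign of the alpha-weighted vote of the base learners (indented to sit inside the Adaboost class):

    # Standard AdaBoost prediction: sign of the alpha-weighted vote.
    def predict(self, X):
        agg = np.zeros(X.shape[0])
        for alpha, estimator in zip(self.alphas, self.estimators):
            agg += alpha * estimator.predict(X)
        return np.where(agg >= 0, 1, -1)

Because zip stops at the shorter sequence, estimators dropped by the early break are simply never consulted, and the alpha = 1.0 set in the zero-error branch lets a perfect single learner cast the deciding vote.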
For testing, I use a one-dimensional toy dataset and the four-dimensional iris dataset.
Conveniently, when we take two of the iris classes for binary classification, a single DecisionStump already separates the data perfectly, so AdaBoost runs into the zero-error base model case.
# test1: a single stump on one-dimensional toy data
X = np.arange(10).reshape(10, 1)
y = np.array([1, 1, 1, -1, -1, -1, 1, 1, 1, -1])
m, n = X.shape
model1 = DecisionStump()
model1.train(X, y)
print(model1.threshold)  # threshold=2.5
print((model1.predict(X) == y).astype(int).mean())

# test2: AdaBoost on the same data
model2 = Adaboost(15)
model2.train(X, y)
print(len(model2.estimators))
print((model2.predict(X) == y).astype(int).mean())
# test3: a single stump on two classes of the iris data
from sklearn import datasets
import matplotlib.pyplot as plt

iris = datasets.load_iris()
X = iris.data
y = iris.target
X = X[y != 2]
y = y[y != 2]
y[y == 0] = -1
plt.scatter(X[y == -1, 2], X[y == -1, 0])
plt.scatter(X[y == 1, 2], X[y == 1, 0])
plt.show()

model3 = DecisionStump()
model3.train(X, y)
print(model3.bestFeature)  # bestFeature=2
print(model3.threshold)    # threshold=1.9
y_hat = model3.predict(X)
print((y_hat == y).astype(int).mean())

# test4: AdaBoost on the iris data (hits the zero-error case)
model4 = Adaboost()
model4.train(X, y)
print(len(model4.estimators))
print((model4.predict(X) == y).astype(int).mean())
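As a quick sanity check (my addition, not part of the original tests), scikit-learn's AdaBoostClassifier, whose default base learner is a depth-1 decision tree, i.e. a stump, should reach the same perfect accuracy on this data:

# Reference run with scikit-learn's AdaBoost; score should also print 1.0.
from sklearn.ensemble import AdaBoostClassifier

ref = AdaBoostClassifier(n_estimators=10)
ref.fit(X, y)
print(ref.score(X, y))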