手撸AdaBoost

数学定义

如果latex表达式忘记怎么写的,可以参考这里
只看代码的可以跳转
提升树 Boosting Tree GBDT

输入:训练数据集 $T = \{ (x_1, y_1), (x_2, y_2), \cdots, (x_N, y_N) \}$,其中 $x_i \in \chi \subseteq R^n$,$y_i \in Y = \{-1, 1\}$;弱学习算法
输出: 最终分类器 $G(x)$
(1) 初始化训练数据的权值分布
$$D_1=(w_{11}, \cdots, w_{1i}, \cdots, w_{1N}), \quad w_{1i}=\frac{1}{N}, \quad i=1,2,\dots,N$$
(2) 对 $m=1,2,\dots,M$
(a) 使用具有权值分布 $D_m$ 的训练数据集学习,得到基本分类器
$$G_m(x): \chi \rightarrow \{ -1, 1 \}$$
(b) 计算 $G_m(x)$ 在训练数据集上的分类误差率
$$e_m=\sum_{i=1}^{N}P(G_m(x_i)\ne y_i) = \sum_{i=1}^N w_{mi}I(G_m(x_i)\ne y_i)$$

(c) 计算 $G_m(x)$ 的系数
$$\alpha_m=\frac{1}{2} \log \frac{1-e_m}{e_m}$$
这里的对数是自然对数
(d) 更新训练数据集的权值分布
$$D_{m+1}=(w_{m+1,1},\cdots,w_{m+1,i}, \cdots, w_{m+1, N})$$
$$w_{m+1, i} = \frac{w_{mi}}{Z_m} \exp (-\alpha_m y_i G_m(x_i)), \quad i=1,2,\cdots,N$$
这里, $Z_m$ 是规范化因子
$$Z_m=\sum_{i=1}^N w_{mi} \exp(-\alpha_m y_i G_m(x_i))$$
它使 $D_{m+1}$ 成为一个概率分布。
(3) 构建基本分类器的线性组合
$$f(x)=\sum_{m=1}^M \alpha_m G_m(x)$$
得到最终分类器
$$G(x)=\mathrm{sign}(f(x)) = \mathrm{sign} \Bigl( \sum_{m=1}^M \alpha_m G_m(x) \Bigr)$$

代码实现

数据

请参考李航的统计学习方法

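| 序号 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| x | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
| y | 1 | 1 | 1 | -1 | -1 | -1 | 1 | 1 | 1 | -1 |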

代码

python代码

import numpy as np
import pandas as pd
# Raise pandas' console display limits: column count, row count, total line
# width and per-cell width.
_DISPLAY_OPTIONS = (
    ("display.max_columns", 20),
    ("display.max_rows", 20),
    ("display.width", 2000),
    ("display.max_colwidth", 2000),
)
for _option_name, _option_value in _DISPLAY_OPTIONS:
    pd.set_option(_option_name, _option_value)


class G:
    """Threshold decision stump that labels inputs with -1 or +1.

    An input is labelled +1 when it satisfies the comparison named by
    ``which_type`` (``"<="`` or ``">="``) against ``threshold``, and -1
    otherwise.
    """

    def __init__(self, threshold, which_type="<="):
        """Store the cut point and the comparison direction."""
        self.which_type = which_type
        self.threshold = threshold

    def __call__(self, x):
        """Return the -1/+1 label(s) for ``x``.

        Raises:
            TypeError: if ``which_type`` is neither ``"<="`` nor ``">="``.
        """
        direction = self.which_type
        if direction not in ("<=", ">="):
            raise TypeError("类型错误, 必须是<= 或 >=")

        if direction == "<=":
            satisfied = x <= self.threshold
        else:
            satisfied = x >= self.threshold

        # Turn the boolean outcome into 0/1, then map 0 -> -1 and 1 -> +1.
        indicator = satisfied * 1
        return 2 * indicator - 1


class AdaBoostClassifier:
    """AdaBoost built from threshold stumps (``G``) on a fixed training set.

    Attributes:
        m: number of boosting rounds run by ``fit``.
        threshold_candidate: iterable of thresholds tried in every round.
        begin_w: initial sample weights (the first entry of ``D_list``).
        x, y: training inputs and their -1/+1 labels.
        D_list: sample-weight vectors, one per round plus the initial one.
        F_list: ``(alpha, stump, error)`` tuples, one per completed round.
    """

    # Weighted error rates are clamped to [_EPS, 1 - _EPS] before the
    # coefficient is computed, so a perfect (or perfectly wrong) stump gives
    # a large finite alpha instead of +/-inf followed by NaN weights.
    _EPS = 1e-10

    def __init__(self, begin_w, x, y, m, threshold_candidate):
        self.m = m
        self.threshold_candidate = threshold_candidate
        self.D_list = []
        self.F_list = []
        self.begin_w = begin_w
        self.D_list.append(begin_w)
        self.x = x
        self.y = y

    def get_threshold_err(self, w):
        """Find the stump with the smallest weighted error under ``w``.

        Both directions (``"<="`` and ``">="``) are tried for every
        candidate threshold; on ties the first candidate tried wins.

        Returns:
            A ``(label, error, stump)`` tuple where ``label`` is
            ``"<threshold>_<direction>"``.

        Raises:
            ValueError: if ``threshold_candidate`` is empty.
        """
        candidates = []
        for threshold in self.threshold_candidate:
            for direction in ("<=", ">="):
                stump = G(threshold, direction)
                err = np.sum((stump(self.x) != self.y) * w)
                candidates.append((f"{threshold}_{direction}", err, stump))
        if not candidates:
            raise ValueError("threshold_candidate must not be empty")
        # min() returns the first minimal element, matching a stable sort.
        return min(candidates, key=lambda item: item[1])

    def get_alpha(self, e):
        """Return the stump coefficient ``0.5 * ln((1 - e) / e)``.

        ``e`` is clamped to ``[_EPS, 1 - _EPS]`` first so that error rates of
        exactly 0 or 1 do not cause a division by zero or ``log(0)``.
        """
        e = np.clip(e, self._EPS, 1 - self._EPS)
        return 0.5 * np.log((1 - e) / e)

    def get_next_weight(self, w, alpha, x, y, G_m):
        """Return the re-weighted, normalised sample distribution.

        Each weight is scaled by ``exp(-alpha * y * G_m(x))`` and the result
        is divided by its sum so that it adds up to 1.
        """
        unnormalised = np.exp(-alpha * y * G_m(x)) * w
        return unnormalised / np.sum(unnormalised)

    def one_step(self, w):
        """Run one boosting round from weights ``w``.

        Records ``(alpha, stump, error)`` in ``F_list`` and returns the
        weights for the next round.
        """
        _, e, Gm = self.get_threshold_err(w)
        alpha = self.get_alpha(e)
        self.F_list.append((alpha, Gm, e))
        return self.get_next_weight(w, alpha, self.x, self.y, Gm)

    def fit(self):
        """Run ``m`` boosting rounds from ``begin_w`` and return ``summary()``.

        Training state is reset first, so calling ``fit`` again retrains from
        scratch instead of appending duplicate rounds.
        """
        self.D_list = [self.begin_w]
        self.F_list = []
        for _ in range(self.m):
            self.D_list.append(self.one_step(self.D_list[-1]))
        return self.summary()

    def summary(self):
        """Return a DataFrame with one row per weight vector in ``D_list``.

        Row ``i`` holds the weights entering round ``i`` and that round's
        alpha, threshold, direction and error. The last row has no round
        attached to it, so those columns are padded with ``"-"``.
        """
        pad = ["-"]
        df = pd.DataFrame({"w": [np.around(wi, 5) for wi in self.D_list],
                           "alpha": [f[0] for f in self.F_list] + pad,
                           "threshold": [f[1].threshold for f in self.F_list] + pad,
                           "type": [f[1].which_type for f in self.F_list] + pad,
                           "err": [f[2] for f in self.F_list] + pad})
        return df

    def predict(self, x):
        """Return -1/+1 predictions for ``x``.

        The prediction is the sign of the alpha-weighted sum of all stump
        outputs; a sum of exactly 0 maps to -1.
        """
        x = np.asarray(x)
        # Accumulate in float64: the alphas are float64, and a float32 buffer
        # would round every partial sum.
        out = np.zeros_like(x, dtype=np.float64)
        for alpha, Gm, _ in self.F_list:
            out += alpha * Gm(x)
        return 2 * (out > 0) - 1


def main():
    """Train AdaBoost on the ten-sample toy data set and print the results."""
    # Training data: inputs 0..9 and their -1/+1 labels.
    labels = np.array([1, 1, 1, -1, -1, -1, 1, 1, 1, -1], dtype='i4')
    features = np.arange(10, dtype=np.int32)

    # Uniform initial sample weights.
    initial_weights = np.full(10, 0.1)

    # Number of boosting rounds (M).
    n_rounds = 4

    # Threshold search space: 0.5, 1.5, ..., 9.5.
    thresholds = np.arange(1, 21, 2) * 0.5

    # Build and train the model.
    model = AdaBoostClassifier(
        initial_weights, features, labels, n_rounds, thresholds
    )
    model.fit()

    # Show the per-round training summary.
    print(model.summary())

    # Predict on the training inputs.
    predicted = model.predict(features)
    print(predicted)

    # Compare the predictions with the true labels element by element.
    print(predicted == labels)


# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

结果输出

                                                                                            w     alpha threshold type       err
0                                          [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]  0.423649       2.5   <=       0.3
1  [0.07143, 0.07143, 0.07143, 0.07143, 0.07143, 0.07143, 0.16667, 0.16667, 0.16667, 0.07143]  0.649641       8.5   <=  0.214286
2  [0.04545, 0.04545, 0.04545, 0.16667, 0.16667, 0.16667, 0.10606, 0.10606, 0.10606, 0.04545]  0.752039       5.5   >=  0.181818
3          [0.125, 0.125, 0.125, 0.10185, 0.10185, 0.10185, 0.06481, 0.06481, 0.06481, 0.125]  0.710693       2.5   <=  0.194444
4  [0.07759, 0.07759, 0.07759, 0.06322, 0.06322, 0.06322, 0.16667, 0.16667, 0.16667, 0.07759]         -         -    -         -
[ 1  1  1 -1 -1 -1  1  1  1 -1]
[ True  True  True  True  True  True  True  True  True  True]
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值