For detailed explanations of GBDT/AdaBoost/XGBoost, see the earlier blog post,
or refer to Li Hang's <统计学习方法> (Statistical Learning Methods).
The code is adapted from an answer on Zhihu.
For how inheritance via super works, see the earlier article; a minimal sketch is given below.
After the sketch comes the code snippet itself (it is meant mainly for understanding the algorithm and has not been run end to end):
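Since super comes up in the subclasses below, here is a minimal refresher sketch (the Base/Child names are hypothetical, not from the referenced article): super(Child, self).__init__(...) forwards construction to the parent class, which is exactly how GBDTRegressor and GBDTClassifier reuse GBDT's __init__.

class Base(object):
    def __init__(self, n_estimators):
        self.n_estimators = n_estimators

class Child(Base):
    def __init__(self, n_estimators=200):
        # Delegate the shared setup to the parent class,
        # just as GBDTRegressor does with GBDT below
        super(Child, self).__init__(n_estimators=n_estimators)

print(Child().n_estimators)  # prints 200

Now the snippet: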
# -*- coding: utf-8 -*-
from __future__ import division, print_function
import numpy as np
import progressbar
# Import helper functions
from utils import train_test_split, standardize, to_categorical
from utils import mean_squared_error, accuracy_score
from utils.loss_functions import SquareLoss, CrossEntropy, SotfMaxLoss  # note: 'SotfMaxLoss' (sic) is the spelling used in the referenced utils module
from decision_tree_model import RegressionTree
from utils.misc import bar_widgets
class GBDT(object):
    def __init__(self, n_estimators, learning_rate, min_samples_split,
                 min_impurity, max_depth, regression):
        self.n_estimators = n_estimators            # number of trees
        self.learning_rate = learning_rate          # learning rate (shrinkage on each tree's contribution)
        self.min_samples_split = min_samples_split  # minimum number of samples required to split a node
        self.min_impurity = min_impurity            # minimum impurity required to keep splitting
        self.max_depth = max_depth                  # maximum depth of each tree
        self.regression = regression                # whether this is a regression problem

        # Progress bar
        self.bar = progressbar.ProgressBar(widgets=bar_widgets)

        self.loss = SquareLoss()
        if not self.regression:
            self.loss = SotfMaxLoss()

        # Classification also uses regression trees: the residuals are used to learn class probabilities
        self.trees = []
        for i in range(self.n_estimators):
            self.trees.append(RegressionTree(min_samples_split=self.min_samples_split,
                                             min_impurity=self.min_impurity,
                                             max_depth=self.max_depth))
    def fit(self, X, y):
        # Core idea: take the gradient of the loss between y and y_pred as the
        # residual r_mi, fit a new tree on (X, r_mi), use that tree's predictions
        # to update y_pred, and repeat for every tree in the ensemble.
        # The first tree fits the raw targets
        self.trees[0].fit(X, y)
        y_pred = self.trees[0].predict(X)
        for i in self.bar(range(1, self.n_estimators)):
            gradient = self.loss.gradient(y, y_pred)  # the gradient of the loss serves as the residual r_mi
            self.trees[i].fit(X, gradient)            # fit a new tree on X and the residuals
            # Update the running prediction: y_pred <- y_pred - learning_rate * new tree's prediction
            y_pred -= np.multiply(self.learning_rate, self.trees[i].predict(X))
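        # Note on the update above: assuming SquareLoss.gradient(y, y_pred)
        # returns -(y - y_pred), as in common implementations, each tree fits
        # the negative residual, so subtracting learning_rate * tree(X) amounts
        # to the classic boosting update F_m(x) = F_{m-1}(x) + learning_rate * h_m(x).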
    def predict(self, X):
        y_pred = self.trees[0].predict(X)
        for i in range(1, self.n_estimators):
            y_pred -= np.multiply(self.learning_rate, self.trees[i].predict(X))

        if not self.regression:
            # Classification: turn the raw scores into a probability distribution (softmax)
            y_pred = np.exp(y_pred) / np.expand_dims(np.sum(np.exp(y_pred), axis=1), axis=1)
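            # (A numerically safer softmax would subtract the row-wise max first,
            # e.g. np.exp(y_pred - y_pred.max(axis=1, keepdims=True)); kept as in
            # the referenced code for clarity.)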
            # Set the label to the class that maximizes the probability
            y_pred = np.argmax(y_pred, axis=1)
        return y_pred
class GBDTRegressor(GBDT):
    def __init__(self, n_estimators=200, learning_rate=0.5, min_samples_split=2,
                 min_var_red=1e-7, max_depth=4, debug=False):
        # min_var_red (minimum variance reduction) plays the role of min_impurity for regression trees
        super(GBDTRegressor, self).__init__(n_estimators=n_estimators,
                                            learning_rate=learning_rate,
                                            min_samples_split=min_samples_split,
                                            min_impurity=min_var_red,
                                            max_depth=max_depth,
                                            regression=True)
class GBDTClassifier(GBDT):
    def __init__(self, n_estimators=200, learning_rate=.5, min_samples_split=2,
                 min_info_gain=1e-7, max_depth=2, debug=False):
        # min_info_gain (minimum information gain) plays the role of min_impurity for classification
        super(GBDTClassifier, self).__init__(n_estimators=n_estimators,
                                             learning_rate=learning_rate,
                                             min_samples_split=min_samples_split,
                                             min_impurity=min_info_gain,
                                             max_depth=max_depth,
                                             regression=False)

    def fit(self, X, y):
        y = to_categorical(y)  # one-hot encode the labels so residuals can be computed per class
        super(GBDTClassifier, self).fit(X, y)
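To close, a hypothetical usage sketch. The load_iris demo data and the test_size argument are assumptions on my part; like the listing above, this is illustrative and has not been verified to run against the referenced utils module.

if __name__ == "__main__":
    from sklearn import datasets  # hypothetical dependency, used only for demo data

    data = datasets.load_iris()
    X, y = data.data, data.target
    # Assumes utils.train_test_split has an sklearn-like signature
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    clf = GBDTClassifier(n_estimators=50, learning_rate=0.5)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))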