Implementing a GBDT Regression Tree in Python

1. For the theory behind GBDT regression trees, see this post: https://blog.csdn.net/zpalyq110/article/details/79527653. To really understand the algorithm, though, it is best to implement the whole process yourself. This post gives a simple Python implementation of GBDT regression and also derives how the residual fitted at each round relates to the previous iteration's residual.

2. Build test data for a simple linear regression

import numpy as np

def create_data():
    # 100 samples: three features and a noisy linear target l = x + y + z + noise
    X = []
    for i in range(100):
        x = 2 * i
        y = 3 * i
        z = i
        l = x + y + z + np.random.rand() * 10
        X.append([x,y,z,l])
    return np.array(X)

data = create_data()
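
A quick sanity check on the generated array (a minimal sketch based on create_data above):

print(data.shape)  # (100, 4): three features plus the target column
print(data[0])     # first row is [0, 0, 0, noise], since x = y = z = 0 when i = 0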

3. Build the CART regression tree. The code is essentially the same as https://blog.csdn.net/Spirit_6275/article/details/89205571, with two small changes: the loss function is replaced with the squared error, and the split point is chosen so that the total squared error after the split is minimal.

Compute the squared error (np.var times the sample count is the sum of squared deviations from the mean, i.e. the squared-error loss of predicting the mean):

def calc_mse(data):
    if len(data) == 0:
        return 0
    label = data[:,-1]
    # variance * count = sum of squared deviations from the mean
    return np.var(label) * len(label)
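
To see why np.var(label) * len(label) equals the sum of squared errors, here is a minimal check (the values are arbitrary; the last column is the label):

arr = np.array([[0, 1.0], [0, 2.0], [0, 6.0]])
mean = arr[:, -1].mean()                  # 3.0
sse = ((arr[:, -1] - mean) ** 2).sum()    # 4 + 1 + 9 = 14
print(calc_mse(arr), sse)                 # both print 14.0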

Find the best split point:

def select_split(data):
    min_gini = np.inf  # despite the name, this tracks the minimum squared error
    best_feat = None
    best_val = None
    left = None
    right = None
    data_type = 'continuity'
    for i in range(data.shape[1]-1):
        c_set = set(data[:, i])
        for val in c_set:
            arr1,arr2 = split_data(data,i,val,data_type)
            g1 = calc_mse(arr1)
            g2 = calc_mse(arr2)
            # g = len(arr1) / len(data) * g1 + len(arr2) / len(data) * g2  # Gini-style weighting, used for classification
            g = g1 + g2  # total squared error remaining after the split
            # print(i,val,g)
            if min_gini > g:
                min_gini = g
                best_feat = i
                best_val = val
                left = arr1
                right = arr2
    return best_feat,best_val,left,right
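
A minimal usage sketch (this assumes split_data from the complete code below is already in scope; the four rows are the toy samples used there):

toy = np.array([[5, 20, 1.1],
                [7, 30, 1.3],
                [21, 70, 1.7],
                [30, 60, 1.8]])
feat, val, left, right = select_split(toy)
print(feat, val)                  # expected: feature 0, threshold 21.0 (remaining squared error 0.025)
print(left[:, -1], right[:, -1])  # labels [1.1 1.3] on the left, [1.7 1.8] on the right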

4. Build the GBDT regression tree

def create_gbdt(dataset,n_estimators,lr):
    '''
    :param dataset: input data
    :param n_estimators: number of weak learners
    :param lr: learning rate, also called the shrinkage rate
    :return: list holding the initial mean prediction followed by the fitted trees
    '''
    data = copy.copy(dataset)
    tree_list = []
    tree_list.append(np.mean(data[:, -1]))            # F_0: the initial prediction is the mean
    data[:, -1] = data[:, -1] - np.mean(data[:, -1])  # F_1 = y - F_0
    tree_list.append(create_tree(data))
    for i in range(1,n_estimators):
        data[:,-1] = (1 - lr) * data[:,-1]  # remaining residual: F_n = (1 - lr) * F_{n-1}
        tree_list.append(create_tree(data))
    return tree_list

Derivation of the remaining residual

We know the GBDT update rule is f_{n} = f_{n-1} + lr * r_{n-1}, where r_{n-1} = y - f_{n-1} is the negative gradient (of the squared-error loss). Let F_{0}, F_{1}, F_{2}, ..., F_{n} denote the residuals each round of learning fits, where F_{0} is the mean of y, F_{1} = y - F_{0}, and lr is the learning rate. Assuming each tree fits its targets exactly (which holds here, since the trees are grown until every leaf is pure), the model after n rounds expands as:

f_{n} = F_{0} + lr * F_{1} + lr * F_{2} + ... + lr * F_{n}

f_{n-1} = F_{0} + lr * F_{1} + ... + lr * F_{n-1}

Substituting r_{n-1} = y - f_{n-1}, the update f_{n} = f_{n-1} + lr * r_{n-1} becomes f_{n} = f_{n-1} + lr * (y - f_{n-1}).

Subtracting the two expansions gives f_{n} - f_{n-1} = lr * F_{n}, so lr * F_{n} = lr * (y - f_{n-1}), which yields F_{n} = y - f_{n-1}, and likewise F_{n-1} = y - f_{n-2}.

Then F_{n} - F_{n-1} = (y - f_{n-1}) - (y - f_{n-2}) = f_{n-2} - f_{n-1} = -lr * F_{n-1}.

So F_{n} = (1 - lr) * F_{n-1}: this is the residual that each successive GBDT learner is trained on.
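
A minimal numeric check of this identity (the target values are arbitrary; "exact fit" means each round's tree reproduces its targets, as the fully grown trees here do):

y = np.array([1.1, 1.3, 1.7, 1.8])
lr = 0.1
f = np.full_like(y, y.mean())   # f_0 = F_0, the mean
F = y - f                       # F_1 = y - F_0
for n in range(2, 6):
    f = f + lr * F                           # f_{n-1} = f_{n-2} + lr * F_{n-1}
    F_new = y - f                            # F_n = y - f_{n-1}
    print(np.allclose(F_new, (1 - lr) * F))  # True each round: F_n = (1 - lr) * F_{n-1}
    F = F_new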

5. Build the prediction function, implemented recursively


# Predict with a single tree
def predict_one(tree,X):
    if type(tree) != dict:
        return tree  # reached a leaf: the stored value is the prediction
    for key in tree:
        # both keys share the same (feature, threshold) prefix, so the first key suffices
        if X[key[0]] < key[1]:
            r = tree[(key[0],key[1],'left')]
        else:
            r = tree[(key[0], key[1], 'right')]
        return predict_one(r, X)

# Predict with the whole ensemble: initial mean plus lr times each tree's prediction
def predict(tree_list,X,lr):
    result = tree_list[0]
    for tree in tree_list[1:]:
        result += lr * predict_one(tree,X)
    return result
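
For reference, create_tree returns a nested dict keyed by (feature_index, threshold, 'left'/'right') tuples. A hand-built sketch (the thresholds and leaf values here are hypothetical, just to show the format):

tree = {(0, 21.0, 'left'):  {(0, 7.0, 'left'): 1.1, (0, 7.0, 'right'): 1.3},
        (0, 21.0, 'right'): {(0, 30.0, 'left'): 1.7, (0, 30.0, 'right'): 1.8}}
print(predict_one(tree, [7, 30]))  # 7 < 21, then 7 >= 7, so the inner 'right' leaf: 1.3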

6. Test with 5 trees:

n_estimators = 5
gbdt_tree = create_gbdt(data,n_estimators,0.1)
print("create gbdt:",predict(gbdt_tree,data[0,:-1],0.1))  # pass the same lr used in training

Complete code (note that create_gbdt below uses the equivalent single-tree form: the labels are updated n_estimators times via f = f + lr * (y - f) and one tree is fit to the final values, which reproduces the boosted ensemble on the training data when every tree fits its targets exactly):

import numpy as np
from collections import Counter
from sklearn.ensemble import GradientBoostingRegressor
import copy


# Toy data for the GBDT regression example
data = np.array([[5,20,1.1],
                 [7,30,1.3],
                 [21,70,1.7],
                 [30,60,1.8]
                 ])

# Compute the loss (sum of squared errors)
def calc_mse(data):
    if len(data) == 0:
        return 0
    label = data[:,-1]
    return np.var(label) * len(label)


# Split the data on a feature/value pair
def split_data(data,feat,val,data_type='classifier'):
    if data_type == 'classifier':
        arr1 = data[np.nonzero(data[:,feat] == val)]
        arr2 = data[np.nonzero(data[:,feat] != val)]
    else:
        arr1 = data[np.nonzero(data[:,feat].astype(float) < val)]
        arr2 = data[np.nonzero(data[:,feat].astype(float) >= val)]
    return arr1,arr2

# Candidate split points for a continuous variable: midpoints of adjacent sorted values
# (defined for completeness; select_split below uses the raw feature values instead)
def continuity_params_process(arr,feat):
    c = arr[:,feat].astype(float)
    c_sort = sorted(set(c))
    new_c = []
    for i in range(len(c_sort)-1):
        val = (c_sort[i] + c_sort[i+1]) / 2
        new_c.append(val)
    return new_c

# Choose the best split point:
# for regression, the split that leaves the smallest total squared error
def select_split(data):
    min_gini = np.inf  # despite the name, this tracks the minimum squared error
    best_feat = None
    best_val = None
    left = None
    right = None
    data_type = 'continuity'
    for i in range(data.shape[1]-1):
        c_set = set(data[:, i])
        for val in c_set:
            arr1,arr2 = split_data(data,i,val,data_type)
            g1 = calc_mse(arr1)
            g2 = calc_mse(arr2)
            # g = len(arr1) / len(data) * g1 + len(arr2) / len(data) * g2  # Gini-style weighting, used for classification
            g = g1 + g2  # total squared error remaining after the split
            # print(i,val,g)
            if min_gini > g:
                min_gini = g
                best_feat = i
                best_val = val
                left = arr1
                right = arr2
    return best_feat,best_val,left,right

# Recursively build the tree
def create_tree(data):
    tree = {}
    # If all labels are identical, return that label as a leaf
    if len(set(data[:,-1])) <= 1:
        return data[:,-1][0]
    # If every sample has identical features, no further split is possible;
    # return the mean label as the leaf value
    dd = data[:,:-1].tolist()
    ddd = list(map(tuple,dd))
    cc = Counter(ddd)
    if len(cc) == 1:
        return np.mean(data[:,-1])
    best_feat,best_val,left,right = select_split(data)
    tree[(best_feat,best_val,'left')] = create_tree(left)
    tree[(best_feat,best_val,'right')] = create_tree(right)
    return tree

# Build the GBDT regression tree (single-tree form, per the derivation above)
def create_gbdt(dataset,n_estimators,lr):
    '''
    :param dataset: input data
    :param n_estimators: number of boosting rounds
    :param lr: learning rate, also called the shrinkage rate
    :return: a single tree fit to the final boosted predictions
    '''
    data = copy.copy(dataset)
    y = dataset[:,-1]
    data[:,-1] = np.mean(data[:,-1])  # f_0: start from the mean
    for i in range(n_estimators):
        data[:,-1] = data[:,-1] + lr * (y - data[:,-1])  # f_n = f_{n-1} + lr * (y - f_{n-1})
    tree = create_tree(data)  # one fully grown tree reproduces f_n on the training data
    return tree

# Predict with a single tree
def predict(tree,X):
    if type(tree) != dict:
        return tree  # leaf value
    for key in tree:
        if X[key[0]] < key[1]:
            r = tree[(key[0],key[1],'left')]
        else:
            r = tree[(key[0], key[1], 'right')]
        return predict(r, X)


n_estimators = 3  # number of boosting rounds
gbdt_tree = create_gbdt(data,n_estimators,0.1)
print("create gbdt:",predict(gbdt_tree,data[0,:-1]))

gbdt = GradientBoostingRegressor(n_estimators=n_estimators,learning_rate=0.1,max_depth=2)
gbdt.fit(data[:,:-1],data[:,-1])
print("GBDT:",gbdt.predict([data[0,:-1]]))
