1. For the theory behind GBDT regression trees, see this post: https://blog.csdn.net/zpalyq110/article/details/79527653. To really understand the algorithm, though, it is best to implement the whole process yourself. This post implements GBDT regression in plain Python and also derives how the residual learned at each step relates to the one from the previous iteration.
2. Build test data for a simple linear regression
import numpy as np

def create_data():
    X = []
    for i in range(100):
        x = 2 * i
        y = 3 * i
        z = i
        # target is a noisy linear combination of the three features
        l = x + y + z + np.random.rand() * 10
        X.append([x, y, z, l])
    return np.array(X)
data = create_data()
3. Build the CART regression tree. The code is essentially the same as https://blog.csdn.net/Spirit_6275/article/details/89205571, with two changes: the loss function is switched to squared error, and the split point is chosen as the one that minimizes the total squared error after the split.
Compute the squared error
def calc_mse(data):
    if len(data) == 0:
        return 0
    label = data[:, -1]
    # variance times sample count = sum of squared errors around the node mean
    return np.var(label) * len(label)
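As a quick sanity check (my own addition, not in the original post), calc_mse should equal the sum of squared deviations from the node mean:

arr = np.array([[0, 1.0], [0, 2.0], [0, 4.0]])
sse = np.sum((arr[:, -1] - arr[:, -1].mean()) ** 2)
assert np.isclose(calc_mse(arr), sse)  # both are ~4.6667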
Find the best split point
def select_split(data):
    min_mse = np.inf
    best_feat = None
    best_val = None
    left = None
    right = None
    data_type = 'continuity'
    for i in range(data.shape[1] - 1):
        c_set = set(data[:, i])
        for val in c_set:
            arr1, arr2 = split_data(data, i, val, data_type)
            if len(arr1) == 0 or len(arr2) == 0:
                continue  # skip degenerate splits that leave one side empty
            g1 = calc_mse(arr1)
            g2 = calc_mse(arr2)
            # g = len(arr1) / len(data) * g1 + len(arr2) / len(data) * g2  # weighted Gini, used for classification
            g = g1 + g2  # total squared error remaining after the split
            if min_mse > g:
                min_mse = g
                best_feat = i
                best_val = val
                left = arr1
                right = arr2
    return best_feat, best_val, left, right
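A quick usage sketch (my own addition; it relies on the split_data helper from the full listing below): on the four-row sample used later, the first split should land on feature 0 at value 21.

sample = np.array([[5, 20, 1.1],
                   [7, 30, 1.3],
                   [21, 70, 1.7],
                   [30, 60, 1.8]])
feat, val, left, right = select_split(sample)
print(feat, val)  # expected: 0 21.0 (rows with feature 0 below 21 go left)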
4. Build the GBDT regression model
def create_gbdt(dataset, n_estimators, lr):
    '''
    :param dataset: input data
    :param n_estimators: number of weak learners
    :param lr: learning rate, also called the shrinkage factor
    :return: list of [initial mean, tree_1, ..., tree_n]
    '''
    data = copy.copy(dataset)
    tree_list = []
    tree_list.append(np.mean(data[:, -1]))          # F_0 is the mean of the targets
    data[:, -1] = data[:, -1] - np.mean(data[:, -1])  # first residual r_1 = y - mean
    tree_list.append(create_tree(data))
    for i in range(1, n_estimators):
        data[:, -1] = (1 - lr) * data[:, -1]  # remaining residual, derived below
        tree_list.append(create_tree(data))
    return tree_list
Derivation of the remaining residual
We know the GBDT update for the learner is $F_m(x) = F_{m-1}(x) + lr \cdot h_m(x)$, where $h_m(x)$ is fit to the negative gradient. For squared loss the negative gradient is simply the residual, so let $r_m = y - F_{m-1}(x)$ be the residual each new learner has to fit, where $F_0(x) = \bar{y}$ is the mean and $lr$ is the learning rate. Because our CART tree fits its training targets exactly, $h_m(x) = r_m$, and the update becomes $F_m(x) = F_{m-1}(x) + lr \cdot r_m$. Substituting and simplifying:
$$r_{m+1} = y - F_m(x) = y - F_{m-1}(x) - lr \cdot r_m = r_m - lr \cdot r_m = (1 - lr) \cdot r_m$$
So the residual each GBDT learner is trained on is just $(1 - lr)$ times the previous one, which is exactly what the line `data[:,-1] = (1 - lr) * data[:,-1]` above computes.
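A quick numerical check of this recurrence (my own sketch, not from the original post): track $F_m$ explicitly and confirm that the next residual equals $(1 - lr)$ times the current one.

import numpy as np

y = np.array([1.1, 1.3, 1.7, 1.8])
lr = 0.1
F = np.full_like(y, y.mean())  # F_0 is the mean
r = y - F                      # r_1
for m in range(5):
    F = F + lr * r             # assume h_m fits r_m exactly
    r_next = y - F
    assert np.allclose(r_next, (1 - lr) * r)  # r_{m+1} = (1 - lr) * r_m
    r = r_next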
5. Build the prediction functions, implemented recursively
# Predict with a single tree
def predict_one(tree, X):
    if type(tree) != dict:
        return tree  # reached a leaf value
    for key in tree:
        if X[key[0]] < key[1]:
            r = tree[(key[0], key[1], 'left')]
        else:
            r = tree[(key[0], key[1], 'right')]
        return predict_one(r, X)

# Predict with the whole ensemble
def predict(tree_list, X, lr):
    result = tree_list[0]  # start from the initial mean F_0
    for tree in tree_list[1:]:
        result += lr * predict_one(tree, X)
    return result
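For reference (my own illustration), a tree here is a nested dict keyed by (feature_index, split_value, branch) tuples, and predict_one simply walks it down to a leaf:

toy_tree = {(0, 21.0, 'left'): 1.2,     # x0 < 21 -> leaf value 1.2
            (0, 21.0, 'right'): 1.75}   # x0 >= 21 -> leaf value 1.75
print(predict_one(toy_tree, np.array([7, 30])))   # 1.2
print(predict_one(toy_tree, np.array([30, 60])))  # 1.75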
6. Test: build a model with 5 trees
n_estimators = 5
gbdt_tree = create_gbdt(data, n_estimators, 0.1)
print("create gbdt:", predict(gbdt_tree, data[0, :-1], 0.1))  # pass the same lr used in training
Full code:
import numpy as np
from collections import Counter
from sklearn.ensemble import GradientBoostingRegressor
import copy
# GBDT regression tree example
data = np.array([[5, 20, 1.1],
                 [7, 30, 1.3],
                 [21, 70, 1.7],
                 [30, 60, 1.8]])
# Compute the loss: sum of squared errors within a node
def calc_mse(data):
    if len(data) == 0:
        return 0
    label = data[:, -1]
    return np.var(label) * len(label)
# Split the data on a feature/value pair
def split_data(data, feat, val, data_type='classifier'):
    if data_type == 'classifier':
        arr1 = data[np.nonzero(data[:, feat] == val)]
        arr2 = data[np.nonzero(data[:, feat] != val)]
    else:
        # continuous feature: left branch < val, right branch >= val
        arr1 = data[np.nonzero(data[:, feat].astype(float) < val)]
        arr2 = data[np.nonzero(data[:, feat].astype(float) >= val)]
    return arr1, arr2
# Candidate split points for a continuous feature:
# midpoints between adjacent sorted unique values
def continuity_params_process(arr, feat):
    c = arr[:, feat].astype(float)
    c_sort = sorted(set(c))
    new_c = []
    for i in range(len(c_sort) - 1):
        val = (c_sort[i] + c_sort[i + 1]) / 2
        new_c.append(val)
    return new_c
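For example (my own illustration), on the first feature of the sample data this returns the midpoints between adjacent unique values. Note that select_split below does not actually call this helper; it simply tries the raw feature values as thresholds.

print(continuity_params_process(data, 0))  # [6.0, 14.0, 25.5]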
# Choose the best split point:
# the one that minimizes the total squared error after the split
def select_split(data):
    min_mse = np.inf
    best_feat = None
    best_val = None
    left = None
    right = None
    data_type = 'continuity'
    for i in range(data.shape[1] - 1):
        c_set = set(data[:, i])
        for val in c_set:
            arr1, arr2 = split_data(data, i, val, data_type)
            if len(arr1) == 0 or len(arr2) == 0:
                continue  # skip degenerate splits that leave one side empty
            g1 = calc_mse(arr1)
            g2 = calc_mse(arr2)
            # g = len(arr1) / len(data) * g1 + len(arr2) / len(data) * g2  # weighted Gini, used for classification
            g = g1 + g2  # total squared error remaining after the split
            if min_mse > g:
                min_mse = g
                best_feat = i
                best_val = val
                left = arr1
                right = arr2
    return best_feat, best_val, left, right
# Build the regression tree recursively
def create_tree(data):
    tree = {}
    if len(set(data[:, -1])) <= 1:
        return data[:, -1][0]  # all targets identical: return the leaf value
    # if every row has identical features, no further split is possible:
    # return the mean of the targets
    dd = data[:, :-1].tolist()
    ddd = list(map(tuple, dd))
    cc = Counter(ddd)
    if len(cc) == 1:
        return np.mean(data[:, -1])
    best_feat, best_val, left, right = select_split(data)
    tree[(best_feat, best_val, 'left')] = create_tree(left)
    tree[(best_feat, best_val, 'right')] = create_tree(right)
    return tree
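On the four-row sample above, the fitted tree comes out as a nested dict like this (my own printout; the exact nesting could differ if several splits tie):

print(create_tree(data))
# {(0, 21.0, 'left'): {(0, 7.0, 'left'): 1.1, (0, 7.0, 'right'): 1.3},
#  (0, 21.0, 'right'): {(0, 30.0, 'left'): 1.7, (0, 30.0, 'right'): 1.8}}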
# Build the GBDT regression model
def create_gbdt(dataset, n_estimators, lr):
    '''
    :param dataset: input data
    :param n_estimators: number of weak learners
    :param lr: learning rate, also called the shrinkage factor
    :return: a single tree fitted to the final ensemble prediction
    '''
    data = copy.copy(dataset)
    y = dataset[:, -1]
    data[:, -1] = np.mean(data[:, -1])  # F_0: start every prediction at the mean
    for i in range(n_estimators):
        # each round moves the prediction a step of size lr toward y,
        # i.e. F_m = F_{m-1} + lr * (y - F_{m-1})
        data[:, -1] = data[:, -1] + lr * (y - data[:, -1])
    tree = create_tree(data)
    return tree
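Note that this full version differs from section 4: instead of keeping a list of trees, it accumulates the ensemble's training-set prediction in the label column and fits one final tree to it. Because create_tree fits its training targets exactly, that single tree reproduces the whole ensemble on the training points. The loop also has a closed form, $F_n = \bar{y} + (1 - (1 - lr)^n)(y - \bar{y})$, which we can verify (my own sketch):

y = data[:, -1]
lr, n = 0.1, 3
closed_form = y.mean() + (1 - (1 - lr) ** n) * (y - y.mean())
target = np.full_like(y, y.mean())
for _ in range(n):
    target = target + lr * (y - target)  # the update inside create_gbdt
assert np.allclose(closed_form, target)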
# Predict with a single tree (walk the nested dict recursively)
def predict(tree, X):
    if type(tree) != dict:
        return tree  # reached a leaf value
    for key in tree:
        if X[key[0]] < key[1]:
            r = tree[(key[0], key[1], 'left')]
        else:
            r = tree[(key[0], key[1], 'right')]
        return predict(r, X)
n_estimators = 3  # number of weak learners
gbdt_tree = create_gbdt(data, n_estimators, 0.1)
print("create gbdt:", predict(gbdt_tree, data[0, :-1]))
# Compare with sklearn; on this tiny dataset the two should agree closely,
# though sklearn's trees are depth-limited (max_depth=2) while ours grow until pure
gbdt = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=0.1, max_depth=2)
gbdt.fit(data[:, :-1], data[:, -1])
print("GBDT:", gbdt.predict([data[0, :-1]]))