24/8/11 Algorithm Notes: GBDT (Gradient Boosted Decision Trees)

import graphviz
import numpy as np
from sklearn import datasets, tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = datasets.load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1124)
Plain decision tree
model = DecisionTreeClassifier()  # a single tree on its own; split criteria for classification: entropy or Gini impurity
model.fit(X_train, y_train)
print('decision tree score', model.score(X_test, y_test))

dot_data = tree.export_graphviz(model,filled=True,rounded=True)
graphviz.Source(dot_data)

Gradient boosting tree
model = GradientBoostingClassifier(subsample=0.8,       # fraction of samples drawn for each tree
                                   learning_rate=0.005) # learning rate
model.fit(X_train, y_train)
print('gradient boosting tree score', model.score(X_test, y_test))

# 3 classes: with the default n_estimators=100, the ensemble holds 100*3 = 300 trees
dot_data = tree.export_graphviz(model.estimators_[0, 2], filled=True, rounded=True)
graphviz.Source(dot_data)
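For multiclass problems, sklearn fits one regression tree per class at every boosting stage, which is where the 100*3 = 300 count comes from. A quick sanity check (not in the original notes) on the fitted model:

# estimators_ is a 2-D array of DecisionTreeRegressor with shape (n_estimators, n_classes)
print(model.estimators_.shape)  # (100, 3) under the default n_estimators=100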

A worked example of GBDT classification

Create the data
import numpy as np
import graphviz
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
X = np.arange(1,11).reshape(-1,1)
y = np.array([0,0,0,1,1]*2)
display(X,y)

Build, train, and predict with the GBDT
clf = GradientBoostingClassifier(n_estimators=3,learning_rate=0.1,max_depth=1)
clf.fit(X,y)
y_ = clf.predict(X)
print('predictions:', y_)

clf.predict_proba(X)

Visualize the first tree
dot_data = tree.export_graphviz(clf.estimators_[0, 0], filled=True, rounded=True)
graphviz.Source(dot_data)

The second tree
dot_data = tree.export_graphviz(clf.estimators_[1, 0], filled=True, rounded=True)
graphviz.Source(dot_data)

The third tree
dot_data = tree.export_graphviz(clf.estimators_[2, 0], filled=True, rounded=True)
graphviz.Source(dot_data)

Fitting the first tree

y

# initial raw score of the model (log-odds)
F0 = np.log(y.sum() / (1 - y).sum())
F0
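For the binomial deviance (log) loss, this starting score is the log-odds of the positive class; an equivalent way to compute it, assuming that loss:

# F0 = log(p / (1 - p)) with p = y.mean() = 0.4, so F0 = log(4/6) ≈ -0.405
p_prior = y.mean()
print(np.log(p_prior / (1 - p_prior)))  # matches F0 above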

# residual: difference between the true label and the predicted probability
residual0 = y - 1 / (1 + np.exp(-F0))
residual0
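Why y − σ(F) plays the role of the residual: with the log loss L = −[y·log(p) + (1 − y)·log(1 − p)] and p = σ(F) = 1/(1 + e^(−F)), the derivative is ∂L/∂F = p − y, so the negative gradient that the next tree must fit is exactly y − p.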

MSE (mean squared error) is a common measure of the gap between a model's predictions and the observed values, widely used to evaluate performance on regression problems. For a node holding residuals r it is (1/n)·Σ(rᵢ − r̄)², and it is the criterion the split search below minimizes.

# MSE before any split
((residual0-residual0.mean())**2).mean()

X.reshape(-1)

lower_mse = 0.24  # MSE before any split, computed above
best_split = {}
# split criterion: weighted MSE of the two children
for i in range(0, 10):   # i starts at 0
    if i == 9:  # no split: all samples stay in one node
        mse = ((residual0 - residual0.mean())**2).mean()
    else:
        left = residual0[:i+1]
        right = residual0[i+1:]
        left_mse = ((left - left.mean())**2).mean()
        right_mse = ((right - right.mean())**2).mean()
        mse = left_mse*(i+1)/10 + right_mse*(10-i-1)/10
        print(X[i:i+2].mean(), np.round(left_mse, 3), np.round(right_mse, 3))
    if lower_mse > mse:
        lower_mse = mse
        best_split.clear()
        best_split['X[0]<='] = X[i:i+2].mean()
print(best_split)
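The same exhaustive scan is repeated for the second and third trees below. A small reusable helper (a sketch with names of my own choosing, assuming a sorted 1-D feature) returns the best threshold and its weighted MSE in one call:

def find_best_split(x, residual):
    # scan every midpoint between consecutive x values; keep the lowest weighted MSE
    n = len(residual)
    best_thr, best_mse = None, ((residual - residual.mean())**2).mean()
    for i in range(n - 1):
        left, right = residual[:i+1], residual[i+1:]
        mse = (((left - left.mean())**2).sum() + ((right - right.mean())**2).sum()) / n
        if mse < best_mse:
            best_mse, best_thr = mse, x[i:i+2].mean()
    return best_thr, best_mse

print(find_best_split(X.reshape(-1), residual0))  # (8.5, 0.15), matching the loop above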


# left leaf value: ≈ -0.625
gamma1 = residual0[:8].sum() / ((y[:8] - residual0[:8]) * (1 - y[:8] + residual0[:8])).sum()
gamma1

# predicted value of the right leaf
gamma2 = residual0[8:].sum() / ((y[8:] - residual0[8:]) * (1 - y[8:] + residual0[8:])).sum()
gamma2
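These expressions are one Newton-Raphson step for the log loss, which is how sklearn's binomial deviance sets leaf values: γ = Σ residual / Σ p(1 − p), and because p = y − residual, the denominator appears as (y − residual)(1 − y + residual). A restatement of gamma1 under that reading:

# p is the current predicted probability; gamma = sum(y - p) / sum(p * (1 - p))
p0 = 1 / (1 + np.exp(-F0))          # a scalar: every sample starts at the same prior probability
print(residual0[:8].sum() / (p0 * (1 - p0) * 8))  # ≈ -0.625, same as gamma1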

Fitting the second tree
# leaf outputs of the first tree (its fit to the negative gradient)
# one value per sample, 10 in total
gamma = np.array([gamma1]*8 + [gamma2]*2)
gamma

learning_rate = 0.1  # learning rate
# gradient-boost on top of the first tree
F1 = F0 + learning_rate*gamma
F1
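In general every stage updates the raw score additively, F_m = F_{m−1} + learning_rate·γ_m(x); the 0.1 here matches the learning_rate passed to GradientBoostingClassifier above.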

Compute the residuals of F1

residual1 = y-1/(1+np.exp(-F1))
residual1

# from here on, the hand-computed numbers agree with the values shown in the tree plots drawn from the fitted GBDT
np.round(((residual1-residual1.mean())**2).mean(),3)
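The same agreement can be checked programmatically; assuming the second boosting stage sits at clf.estimators_[1, 0], its root-node impurity is the variance of the targets it was trained on, i.e. of residual1:

# root impurity of the second tree should equal the pre-split MSE of residual1
print(np.round(clf.estimators_[1, 0].tree_.impurity[0], 3))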

lower_mse = 0.223  # MSE of residual1 before any split, computed above
best_split = {}
for i in range(10):
    if i == 9:  # 9 is the last index; range(10) never yields 10
        mse = np.round(((residual1 - residual1.mean())**2).mean(), 3)
    else:
        left = residual1[:i+1]  # slices are half-open: left-closed, right-open
        right = residual1[i+1:]
        left_mse = ((left - left.mean())**2).mean()
        right_mse = ((right - right.mean())**2).mean()
        mse = left_mse*(i+1)/10 + right_mse*(10-i-1)/10  # weighted sum, not a product
        print('split after sample %d' % (i+1), np.round(left_mse, 3), np.round(right_mse, 3))
    if lower_mse > mse:
        lower_mse = mse
        best_split.clear()
        best_split['X[0]<='] = X[i:i+2].mean()
print('best split condition:', best_split)
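The helper defined after the first tree gives the same answer in one call:

print(find_best_split(X.reshape(-1), residual1))  # threshold 8.5 again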

# predicted values of the left and right leaf nodes
gamma1 = residual1[:8].sum() / (((y[:8] - residual1[:8]) * (1 - y[:8] + residual1[:8])).sum())
np.round(gamma1, 3)

gamma2 = residual1[8:].sum() / (((y[8:] - residual1[8:]) * (1 - y[8:] + residual1[8:])).sum())
np.round(gamma2,3)

Fitting the third tree
# take the second tree's leaf outputs (the value shown at each leaf node)
gamma = np.array([gamma1]*8 + [gamma2]*2)
gamma

F2 = F1 + learning_rate*gamma
F2

residual2 = y - 1/(1 + np.exp(-F2))
residual2

np.round(((residual2-residual2.mean())**2).mean(),3)

Find the best split condition
lower_mse = 0.209  # MSE of residual2 before any split, computed above
best_split = {}
for i in range(10):
    if i == 9:  # 9 is the last index; range(10) never yields 10
        mse = np.round(((residual2 - residual2.mean())**2).mean(), 3)
    else:
        left = residual2[:i+1]  # slices are half-open: left-closed, right-open
        right = residual2[i+1:]
        left_mse = ((left - left.mean())**2).mean()
        right_mse = ((right - right.mean())**2).mean()
        mse = left_mse*(i+1)/10 + right_mse*(10-i-1)/10  # weighted sum, not a product
        print('split after sample %d' % (i+1), np.round(left_mse, 5), np.round(right_mse, 5))
    if lower_mse > mse:
        lower_mse = mse
        best_split.clear()
        best_split['X[0]<='] = X[i:i+2].mean()
print('best split condition:', best_split)
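And once more with the helper:

print(find_best_split(X.reshape(-1), residual2))  # the split moves to 3.5 this time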

Compute the predicted value of each leaf node

gamma1 = residual2[:3].sum() / (((y[:3] - residual2[:3]) * (1 - y[:3] + residual2[:3])).sum())
np.round(gamma1, 3)

gamma2 = residual2[3:].sum()/(((y[3:]-residual2[3:])*(1-y[3:]+residual2[3:])).sum())
np.round(gamma2,3)

The predicted probabilities

# probabilities predicted by the algorithm
proba_ = clf.predict_proba(X)
proba_

# convert probabilities to class labels
proba_.argmax(axis = 1)

clf.predict(X)

# leaf values of the third tree (the 'value' field in the plots): 3 samples go left, 7 go right
gamma = np.array([gamma1]*3+[gamma2]*7)
gamma

F3 = F2+learning_rate*gamma
F3

# by default the probability computed is for class 1
# two classes: 0 and 1
p= 1/(1+np.exp(-F3))
p

proba_

np.c_[1-p,p]
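A final check (my addition) that the three hand-built stages reproduce the library's probabilities up to floating-point noise:

print(np.allclose(np.c_[1 - p, p], proba_))  # True when the manual chain matches sklearn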
