# Algorithm notes (2024-08-11): AdaBoost — Adaptive Boosting

import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import tree
import graphviz
# Load the data
X, y = datasets.load_iris(return_X_y=True)

# Hold out 20% of the samples for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1024)
# Build the model
# Three weak learners; 'SAMME' is the discrete multi-class AdaBoost variant.
ada = AdaBoostClassifier(n_estimators=3, algorithm='SAMME', learning_rate=1.0)

ada.fit(X_train, y_train)  # fit the ensemble on the training split

y_ = ada.predict(X_test)            # hard class predictions on the test set

proba_ = ada.predict_proba(X_test)  # per-class probabilities on the test set
print('分类准确率', ada.score(X_test, y_test))
# `display` is IPython/Jupyter-only and raises NameError in a plain script;
# use print so this file runs outside a notebook as well.
print(y_, proba_)

# Inspect the first tree of the ensemble

# Export the first weak learner to Graphviz DOT format and render it.
first_tree_dot = tree.export_graphviz(ada[0], filled=True, rounded=True)
graphviz.Source(first_tree_dot)

# Gini impurity at the root with the initial (uniform) sample weights

# One weight per training sample, uniform in round 1.
# Derive the sample count from the data instead of hard-coding 120
# (the original comment also wrongly called this the sample "class").
n_samples = len(y_train)
w1 = np.full(shape=n_samples, fill_value=1 / n_samples)

gini = 1

for i in range(3):  # three classes: 0, 1, 2
    # weighted probability of class i
    cond = y_train == i        # mask selecting the samples of this class

    p = w1[cond].sum()         # total weight held by this class

    gini -= p ** 2
print(gini)

# Find the best split for the first tree (weighted Gini criterion)
best_split = {}   # best split condition(s) found so far
lower_gini = 1    # smallest weighted Gini impurity seen so far

# Try every feature and every midpoint between consecutive sorted values.
for col in range(X_train.shape[1]):  # 4 features: which one to split on
    # The copy+sort only depends on `col`, so hoist it out of the inner loop
    # (the original re-sorted the column for every candidate cut point, and
    # shadowed the dataset variable `X` while doing so).
    feature_sorted = np.sort(X_train[:, col])
    for i in range(len(X_train) - 1):  # which cut point within the feature
        # candidate threshold: midpoint of two consecutive sorted values
        split = feature_sorted[i:i + 2].mean()

        cond = (X_train[:, col] <= split).ravel()  # left/right membership

        left = y_train[cond]
        right = y_train[~cond]

        # Gini impurity of each side (plain counts == uniform w1 weights).
        gini_left = 0
        gini_right = 0

        for target in range(3):
            p1 = (left == target).sum() / left.size  # class share, left side
            gini_left += p1 * (1 - p1)

            if right.size != 0:  # right side is empty for tied maximum values
                p2 = (right == target).sum() / right.size
                gini_right += p2 * (1 - p2)
        left_p = w1[cond].sum()   # weight mass falling on the left side
        right_p = 1 - left_p

        # Weighted average impurity of the two children.
        gini = left_p * gini_left + right_p * gini_right
        if gini < lower_gini:  # strictly better split: keep only this one
            lower_gini = gini
            best_split.clear()
            best_split['X[%d]' % (col)] = split
        elif gini == lower_gini:  # tie: record this split as well
            best_split['X[%d]' % (col)] = split
print(best_split)

# Error rate of the first weak learner
y1_ = ada[0].predict(X_train)  # tree #1 predictions on the training set

print(y1_)
# With uniform weights the weighted error equals the plain mis-rate.
e1 = (y_train != y1_).mean()
print('第一棵树误差率', e1)
print('算法的误差率', ada.estimator_errors_)

# Weight (alpha) of the first weak learner

learning_rate = 1.0
num = 3  # number of classes (multi-class SAMME)

# SAMME: alpha = lr * (ln((1 - err) / err) + ln(K - 1))
a1 = learning_rate * (np.log((1 - e1) / e1) + np.log(num - 1))

print('手动计算算法权重是', a1)

print('算法返回的分类器权重是', ada.estimator_weights_)

# Update the sample weights for round 2

# Misclassified samples are boosted by exp(a1); correct ones are unchanged.
w2 = w1 * np.exp(a1 * (y_train != y1_))
w2 = w2 / w2.sum()  # renormalize so the weights sum to 1
w2

# Inspect the second tree
second_tree_dot = tree.export_graphviz(ada[1], filled=True, rounded=True)
graphviz.Source(second_tree_dot)

# Total updated weight now held by each class.
for label in range(3):
    class_weight = w2[y_train == label].sum()
    print(np.round(class_weight, 3))

# Gini impurity at the root for round 2 (weights w2)
gini = 1

for label in range(3):  # three classes: 0, 1, 2
    # weighted probability of this class under w2
    p = w2[y_train == label].sum()

    gini -= p ** 2
print(np.round(gini, 3))
# output: 0.5

# Find the best split for the second tree (weights w2)

best_split = {}   # best split condition(s) found so far
lower_gini = 1    # smallest weighted Gini impurity seen so far

for col in range(X_train.shape[1]):  # 4 features: which one to split on
    # The copy+sort only depends on `col`, so hoist it out of the inner loop.
    feature_sorted = np.sort(X_train[:, col])
    for i in range(len(X_train) - 1):  # which cut point within the feature
        # candidate threshold: midpoint of two consecutive sorted values
        split = feature_sorted[i:i + 2].mean()

        cond = (X_train[:, col] <= split).ravel()  # left/right membership

        # Robustness: when the two largest sorted values are tied, the right
        # side is empty and the original divided by zero (yielding NaNs that
        # the comparisons below silently discarded). Skip such splits.
        if cond.all():
            continue

        # Class labels and per-side renormalized weight distributions.
        left = y_train[cond]
        left_w = w2[cond] / w2[cond].sum()     # weight distribution, left

        right = y_train[~cond]
        right_w = w2[~cond] / w2[~cond].sum()  # weight distribution, right

        # Gini impurity of each side, computed from the weights.
        gini_left = 0
        gini_right = 0

        for target in range(3):
            p1 = left_w[left == target].sum()    # weighted class share, left
            gini_left += p1 * (1 - p1)

            p2 = right_w[right == target].sum()  # weighted class share, right
            gini_right += p2 * (1 - p2)

        left_p = w2[cond].sum()   # total weight mass on the left side
        right_p = 1 - left_p

        # Weighted average impurity of the two children.
        gini = left_p * gini_left + right_p * gini_right

        if gini < lower_gini:     # strictly better split: keep only this one
            lower_gini = gini
            best_split.clear()
            best_split['X[%d]' % (col)] = split
        elif gini == lower_gini:  # tie: record this split as well
            best_split['X[%d]' % (col)] = split
print(best_split)

# Weighted error rate of the second weak learner
y2_ = ada[1].predict(X_train)  # tree #2 predictions on the training set

print(y2_)

# Weighted error: total weight of the misclassified samples under w2
# (the unweighted mis-rate used in round 1 no longer applies).
cond = y_train != y2_
e2 = w2[cond].sum()

# Bug fix: the original message said "第一棵树" (first tree).
print('第二棵树误差率', e2)
print('算法的误差率', ada.estimator_errors_)

# Weight (alpha) of the second weak learner
learning_rate = 1.0
num = 3  # number of classes

# SAMME: alpha = lr * (ln((1 - err) / err) + ln(K - 1))
a2 = learning_rate * (np.log((1 - e2) / e2) + np.log(num - 1))

print('手动计算算法权重是', a2)

print('算法返回的分类器权重是', ada.estimator_weights_)

# Update the sample weights for round 3

# Boost the weight of the samples tree #2 got wrong, then renormalize.
w3 = w2 * np.exp(a2 * (y_train != y2_))
w3 = w3 / w3.sum()
w3

# Inspect the third tree

third_tree_dot = tree.export_graphviz(ada[2], filled=True, rounded=True)
graphviz.Source(third_tree_dot)

# Gini impurity at the root for round 3 (weights w3)
gini = 1

for label in range(3):  # three classes: 0, 1, 2
    # weighted probability of this class under w3
    p = w3[y_train == label].sum()

    gini -= p ** 2
print(np.round(gini, 3))
# output: 0.52
# Find the best split for the third tree (weights w3)
best_split = {}   # best split condition(s) found so far
lower_gini = 1    # smallest weighted Gini impurity seen so far

for col in range(X_train.shape[1]):  # 4 features: which one to split on
    # The copy+sort only depends on `col`, so hoist it out of the inner loop.
    feature_sorted = np.sort(X_train[:, col])
    for i in range(len(X_train) - 1):  # which cut point within the feature
        # candidate threshold: midpoint of two consecutive sorted values
        split = feature_sorted[i:i + 2].mean()

        cond = (X_train[:, col] <= split).ravel()  # left/right membership

        # Robustness: when the two largest sorted values are tied, the right
        # side is empty and the original divided by zero (yielding NaNs that
        # the comparisons below silently discarded). Skip such splits.
        if cond.all():
            continue

        # Class labels and per-side renormalized weight distributions.
        left = y_train[cond]
        left_w = w3[cond] / w3[cond].sum()     # weight distribution, left

        right = y_train[~cond]
        right_w = w3[~cond] / w3[~cond].sum()  # weight distribution, right

        # Gini impurity of each side, computed from the weights.
        gini_left = 0
        gini_right = 0

        for target in range(3):
            p1 = left_w[left == target].sum()    # weighted class share, left
            gini_left += p1 * (1 - p1)

            p2 = right_w[right == target].sum()  # weighted class share, right
            gini_right += p2 * (1 - p2)

        left_p = w3[cond].sum()   # total weight mass on the left side
        right_p = 1 - left_p

        # Weighted average impurity of the two children.
        gini = left_p * gini_left + right_p * gini_right

        if gini < lower_gini:     # strictly better split: keep only this one
            lower_gini = gini
            best_split.clear()
            best_split['X[%d]' % (col)] = split
        elif gini == lower_gini:  # tie: record this split as well
            best_split['X[%d]' % (col)] = split
print(best_split)
# output: {'X[3]': 1.65}
# Weighted error rate of the third weak learner
y3_ = ada[2].predict(X_train)  # tree #3 predictions on the training set

print(y3_)

# Weighted error: total weight of the misclassified samples under w3.
cond = y_train != y3_
e3 = w3[cond].sum()

# Bug fix: the original message said "第一棵树" (first tree).
print('第三棵树误差率', e3)
print('算法的误差率', ada.estimator_errors_)

# Weight (alpha) of the third weak learner

learning_rate = 1.0
num = 3  # number of classes

# SAMME: alpha = lr * (ln((1 - err) / err) + ln(K - 1))
a3 = learning_rate * (np.log((1 - e3) / e3) + np.log(num - 1))

print('手动计算算法权重是', a3)

print('算法返回的分类器权重是', ada.estimator_weights_)

# Probability computation

# Probabilities returned by the algorithm (first five test samples).
proba_ = ada.predict_proba(X_test)[:5]

proba_

# Probabilities reconstructed by hand:
# one-hot encode each tree's prediction, then combine with the alphas.
proba1 = (ada[0].predict(X_test) == np.array([[0], [1], [2]])).T.astype(np.int8)
proba1
proba2 = (ada[1].predict(X_test) == np.array([[0], [1], [2]])).T.astype(np.int8)
proba2
proba3 = (ada[2].predict(X_test) == np.array([[0], [1], [2]])).T.astype(np.int8)
proba3
proba = proba1 * a1 + proba2 * a2 + proba3 * a3
proba

# Normalize by the total estimator weight...
proba = proba / ada.estimator_weights_.sum()
proba

# ...and divide by K - 1, matching sklearn's SAMME decision scaling.
proba = proba / (num - 1)
proba

# Softmax probabilities

# By hand: row-wise softmax of the combined scores.
exp_scores = np.exp(proba)
(exp_scores / exp_scores.sum(axis=1).reshape(-1, 1))[:5]

# Function result for comparison (proba_ holds only 5 rows, so
# slicing [:20] simply shows all of them).
proba_[:20]

# (CSDN webpage footer — like/bookmark counters and payment widget text —
#  removed: it was scraping residue, not part of the notes.)