I implemented an AdaBoost that uses a binary tree as the base classifier. It only handles categorical features and only does classification; supporting numerical features or numerical prediction would just take small changes to this version (a hedged sketch of the needed splitter change appears inside the code, right after the tree trainer). I tested it on a dataset and compared it against sklearn's GBDT and XGBoost; the f1 scores are as follows:
GBDT: 0.933678670778
XGBoost: 0.933678670778
My AdaBoost implementation: 0.905342414024
The accuracy still falls a bit short, which makes sense since row sampling, column sampling, shrinkage, and regularization haven't been added yet (a rough sketch of the first three is appended after the code at the end of this post).
I'm sharing this for discussion; the test data is in the attachment, and feel free to leave a comment if you have questions.
Test data download link: http://archive.ics.uci.edu/ml/machine-learning-databases/car/
The code is as follows:
# coding=utf8
import os

import numpy as np

os.chdir(r'D:\Study\ML\MLAction')


def cal_info(y, weight):
    """
    calculate weighted entropy
    :param y: label, 1*m array
    :param weight: 1*m array
    :return: information value
    """
    if y is None or weight is None:
        return 0
    if y.shape != weight.shape:
        raise Exception("the length of weight is not equal to y")
    weight_sum = np.sum(weight)
    info = 0
    for c in np.unique(y):
        p = np.sum(weight[y == c]) * 1.0 / weight_sum
        info -= p * np.log(p)
    return info


def leaf_val(y, weight):
    """
    get leaf value: the class with the largest total weight
    :param y: 1*m array
    :param weight: 1*m array
    :return: leaf class
    """
    if y is None or weight is None:
        return None
    y_stat = {}
    for c in np.unique(y):
        y_stat[c] = np.sum(weight[y == c])
    y_stat = sorted(y_stat.items(), key=lambda x: x[1], reverse=True)
    return y_stat[0][0]


def chose_best_split(X, y, weight):
    """
    choose the best feature and value to split on (equality test on categorical values)
    :param X: m*n array
    :param y: 1*m array
    :param weight: 1*m array
    :return: split_feature, split_value
    """
    if X is None or y is None or weight is None:
        return None, None
    m, n = X.shape
    best_split_feat = None
    best_split_val = None
    best_info_gain = 0
    info_before = cal_info(y, weight)
    weight_sum = np.sum(weight)
    for split_feat in range(n):
        for split_val in np.unique(X[:, split_feat]):
            equal_bl = X[:, split_feat] == split_val
            left_y, right_y = y[equal_bl], y[~equal_bl]
            left_weight, right_weight = weight[equal_bl], weight[~equal_bl]
            info_left = cal_info(left_y, left_weight)
            info_right = cal_info(right_y, right_weight)
            info_gain = info_before \
                - np.sum(left_weight) * 1.0 / weight_sum * info_left \
                - np.sum(right_weight) * 1.0 / weight_sum * info_right
            if info_gain > best_info_gain:
                best_info_gain = info_gain
                best_split_feat = split_feat
                best_split_val = split_val
    return best_split_feat, best_split_val


def classfy_tree_train(X, y, weight=None, min_split_gain=1e-6, min_leaf_data=3):
    """
    train a classification tree
    :param X: m*n array
    :param y: 1*m array
    :param weight: 1*m array
    :param min_split_gain: minimum information gain required to split
    :param min_leaf_data: minimum number of samples in a leaf
    :return: tree root node {splitfeat: 0, splitval: 0, left: node, right: node};
             in a leaf node, splitfeat is None and splitval holds the leaf value
    """
    # check input
    if X is None or y is None:
        raise Exception('X is None or y is None')
    m, n = X.shape
    if m != len(y):
        raise Exception('the length of X is not equal to y')
    if weight is None:
        weight = np.ones(m)
    node = {'splitfeat': None, 'splitval': None, 'left': None, 'right': None}
    if m < min_leaf_data:
        node['splitval'] = leaf_val(y, weight)
        return node
    # split, then check whether the split is acceptable
    split_feat, split_val = chose_best_split(X, y, weight)
    if split_feat is None or split_val is None:
        node['splitval'] = leaf_val(y, weight)
        return node
    equal_bl = X[:, split_feat] == split_val
    left_X, right_X = X[equal_bl], X[~equal_bl]
    left_y, right_y = y[equal_bl], y[~equal_bl]
    left_weight, right_weight = weight[equal_bl], weight[~equal_bl]
    if left_y.shape[0] < min_leaf_data or right_y.shape[0] < min_leaf_data:
        node['splitval'] = leaf_val(y, weight)
        return node
    info_before = cal_info(y, weight)
    info_left = cal_info(left_y, left_weight)
    info_right = cal_info(right_y, right_weight)
    weight_sum = np.sum(weight)
    info_gain = info_before \
        - np.sum(left_weight) * 1.0 / weight_sum * info_left \
        - np.sum(right_weight) * 1.0 / weight_sum * info_right
    if info_gain < min_split_gain:
        node['splitval'] = leaf_val(y, weight)
        return node
    # create an internal node and recurse
    node['splitfeat'] = split_feat
    node['splitval'] = split_val
    node['left'] = classfy_tree_train(left_X, left_y, left_weight, min_split_gain, min_leaf_data)
    node['right'] = classfy_tree_train(right_X, right_y, right_weight, min_split_gain, min_leaf_data)
    return node
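# ---------------------------------------------------------------------------
# A hedged sketch, not part of the tested code above: the splitter only tests
# categorical values for equality. As noted at the top, supporting numerical
# features means switching to a threshold test; one possible version follows.
# The name chose_best_numeric_split and the midpoint candidate thresholds are
# my own assumptions. Prediction would change the same way: x[splitfeat] <=
# splitval goes left instead of the equality test. For numerical prediction
# (regression), the same skeleton works with weighted variance in place of
# entropy and a weighted mean as the leaf value.
def chose_best_numeric_split(X, y, weight):
    """sketch: best threshold split (x <= t goes left) for numerical features"""
    m, n = X.shape
    best_feat, best_thresh, best_gain = None, None, 0
    info_before = cal_info(y, weight)
    weight_sum = np.sum(weight)
    for feat in range(n):
        values = np.unique(X[:, feat])
        # candidate thresholds: midpoints between consecutive sorted values
        for thresh in (values[:-1] + values[1:]) / 2.0:
            left_bl = X[:, feat] <= thresh
            left_w, right_w = weight[left_bl], weight[~left_bl]
            gain = info_before \
                - np.sum(left_w) * 1.0 / weight_sum * cal_info(y[left_bl], left_w) \
                - np.sum(right_w) * 1.0 / weight_sum * cal_info(y[~left_bl], right_w)
            if gain > best_gain:
                best_feat, best_thresh, best_gain = feat, thresh, gain
    return best_feat, best_thresh
# ---------------------------------------------------------------------------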
def classfy_tree_onepredict(x, tree_node):
    """
    predict the class of a single sample x
    :param x: 1*n array
    :param tree_node: a trained tree node
    :return: class
    """
    if x is None or tree_node is None:
        return None
    if tree_node['splitfeat'] is None:
        return tree_node['splitval']
    split_feat = tree_node['splitfeat']
    if x[split_feat] == tree_node['splitval']:
        return classfy_tree_onepredict(x, tree_node['left'])
    else:
        return classfy_tree_onepredict(x, tree_node['right'])


def classfy_tree_predict(X, tree_node):
    """
    predict the classes of X
    :param X: m*n array
    :param tree_node: a trained tree node
    :return: classes, 1*m array
    """
    m = X.shape[0]
    predict_classes = []
    for i in range(m):
        predict_classes.append(classfy_tree_onepredict(X[i], tree_node))
    return np.array(predict_classes)


def boost_tree_train(X, y, weight=None, min_split_gain=1e-6, min_leaf_data=3, num_iter=10):
    """
    train a sequence of boosted trees
    :param X: m*n array
    :param y: 1*m array
    :param weight: 1*m array of initial sample weights
    :param min_split_gain: minimum information gain required to split
    :param min_leaf_data: minimum number of samples in a leaf
    :param num_iter: the number of trees
    :return: list of trees [[tree1, weight1], [tree2, weight2], ...]
    """
    # check input
    if X is None or y is None:
        raise Exception('X is None or y is None')
    m, n = X.shape
    if m != len(y):
        raise Exception('the length of X is not equal to y')
    tree_list = []
    if weight is None:
        weight = np.ones(m)
    weight = weight / np.sum(weight)
    for i in range(num_iter):
        # train a base tree on the current weights
        tree = classfy_tree_train(X, y, weight, min_split_gain, min_leaf_data)
        predict_classes = classfy_tree_predict(X, tree)
        # weighted training error of this tree
        wrong = predict_classes != y
        error = np.sum(weight[wrong])
        if error > 0:
            tree_weight = 0.5 * (np.log(1 - error) - np.log(error))
        else:
            # perfect tree: give it a very large vote
            tree_weight = 1e10
        # multi-class update: raise the weight of misclassified samples
        # (the exp(-alpha*y*h(x)) product form only works for labels in {-1, +1})
        weight = weight * np.exp(tree_weight * wrong)
        weight = weight / np.sum(weight)
        tree_list.append([tree, tree_weight])
    return tree_list


def boost_tree_onepredict(x, tree_list):
    """
    predict the class of a single sample x by weighted vote
    :param x: 1*n array
    :param tree_list: boosted tree list
    :return: predicted class
    """
    votes = {}
    for tree, weight in tree_list:
        pred = classfy_tree_onepredict(x, tree)
        votes[pred] = votes.get(pred, 0) + weight
    votes = sorted(votes.items(), key=lambda v: v[1], reverse=True)
    return votes[0][0]


def boost_tree_predict(X, tree_list):
    """
    predict the classes of X
    :param X: m*n array
    :param tree_list: boosted tree list
    :return: classes, 1*m array
    """
    m = X.shape[0]
    predict_classes = []
    for i in range(m):
        predict_classes.append(boost_tree_onepredict(X[i], tree_list))
    return np.array(predict_classes)


# read data
import pandas as pd
from sklearn.preprocessing import LabelEncoder

car = pd.read_csv('Ch07/car.data', header=None)
for i in range(car.shape[1]):
    car.iloc[:, i] = LabelEncoder().fit_transform(car.iloc[:, i])

# shuffle the rows, then cut into train and test data
train_size = int(car.shape[0] * 0.6)
select_index = np.random.permutation(car.shape[0])
car = car.iloc[select_index, :]
car_train = car.iloc[:train_size, :]
X_train = car_train.iloc[:, :-1]
y_train = car_train.iloc[:, -1]
# unbalanced data, so weight each sample by its class's inverse frequency
y_ct = y_train.value_counts()
y_weight = np.sum(y_ct) * 1.0 / y_ct
weight = y_weight[y_train]
car_test = car.iloc[train_size:, :]
X_test = car_test.iloc[:, :-1]
y_test = car_test.iloc[:, -1]

# GBDT
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score

gbdt = GradientBoostingClassifier().fit(X_train, y_train, weight.values)
gbdt_pred = gbdt.predict(X_test)
print(f1_score(y_test, gbdt_pred, average='macro'))

# XGBoost
from xgboost.sklearn import XGBClassifier

xgb = XGBClassifier(objective='multi:softmax', n_estimators=30).fit(X_train, y_train, weight.values)
xgb_pred = xgb.predict(X_test)
print(f1_score(y_test, xgb_pred, average='macro'))
# self-defined adaboost
best_classifys = boost_tree_train(X_train.values, y_train.values, weight.values, num_iter=25)
b_pred = boost_tree_predict(X_test.values, best_classifys)
print(f1_score(y_test, b_pred, average='macro'))
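As a rough illustration of the missing pieces mentioned above, here is one way row sampling, column sampling, and shrinkage could be bolted onto the trainer. This is only a sketch under my own assumptions: the helpers remap_feats and boost_tree_train_sampled, the row_frac/col_frac/shrinkage parameters, and the uniform subsampling are hypothetical choices, not part of the tested code; shrinkage here simply damps each tree's vote weight.

def remap_feats(node, cols):
    """sketch: map split feature indices from the column subsample back to X"""
    if node is None or node['splitfeat'] is None:
        return
    node['splitfeat'] = cols[node['splitfeat']]
    remap_feats(node['left'], cols)
    remap_feats(node['right'], cols)


def boost_tree_train_sampled(X, y, weight=None, num_iter=10,
                             row_frac=0.8, col_frac=0.8, shrinkage=0.5):
    """sketch: boost_tree_train plus row/column subsampling and shrinkage"""
    m, n = X.shape
    if weight is None:
        weight = np.ones(m)
    weight = weight / np.sum(weight)
    tree_list = []
    for i in range(num_iter):
        # each round, fit the base tree on a random subset of rows and columns
        rows = np.random.choice(m, max(1, int(m * row_frac)), replace=False)
        cols = np.random.choice(n, max(1, int(n * col_frac)), replace=False)
        tree = classfy_tree_train(X[np.ix_(rows, cols)], y[rows], weight[rows])
        remap_feats(tree, cols)  # so splitfeat indexes into the full X again
        # the weight update itself runs on the full training set, as before
        wrong = classfy_tree_predict(X, tree) != y
        error = np.sum(weight[wrong])
        # stop once the weighted error reaches 0.5, where the vote weight
        # would turn non-positive, or the tree is already perfect
        if error <= 0 or error >= 0.5:
            break
        tree_weight = shrinkage * 0.5 * (np.log(1 - error) - np.log(error))
        weight = weight * np.exp(tree_weight * wrong)
        weight = weight / np.sum(weight)
        tree_list.append([tree, tree_weight])
    return tree_list

It plugs into the same predictors as before, e.g. boost_tree_predict(X_test.values, boost_tree_train_sampled(X_train.values, y_train.values, weight.values, num_iter=25)).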