集成算法之Boosting - Adaboost，基于CART树的Python实现

最新推荐文章于 2021-12-25 02:43:44 发布

Lee_Yu_Rui

最新推荐文章于 2021-12-25 02:43:44 发布

阅读量737

点赞数 2

文章标签： python 机器学习算法

本文链接：https://blog.csdn.net/Lee_Yu_Rui/article/details/107294360

版权

Boosting 族算法大多根据前一个学习器的训练效果调整样本分布，再根据新的样本分布训练下一个学习器；样本分布的调整方式不同，就得到 AdaBoost、Gradient Boosting、XGBoost 等不同算法。

推导过程可以参考以下博客，有不懂的地方也可以留言交流，这里只给出代码。基学习器基于 CART，AdaBoost 部分完全按照推导过程编写，可以对照公式阅读。

https://www.cnblogs.com/pinard/p/6133937.html

https://www.cnblogs.com/massquantity/p/9063033.html

class weakLearner():
    """Weighted decision stump: a one-split CART classifier using Gini impurity.

    ``train`` searches every feature for the single binary split that
    minimises the weight-scaled Gini impurity of the two children, and
    ``predict`` labels a sample with the most frequent training label of the
    child it falls into.

    Attributes:
        type_feature: per-feature flags set by ``train``
            (0 = discrete, 1 = continuous).
        w: sample weights given to the most recent ``train`` call.
        tree: dict describing the fitted stump (keys ``cut_f``, ``cut_val``,
            ``left_tree``, ``right_tree``, ``left_weight``, ``right_weight``),
            or ``None`` before training.
    """

    # A feature with fewer distinct values than this is treated as discrete.
    _MAX_DISCRETE_VALUES = 10
    # Number of candidate thresholds tried for each continuous feature.
    _N_THRESHOLDS = 50

    def __init__(self):
        self.type_feature = None
        self.w = None
        self.tree = None

    def __Gini(self, y, sample_weight):
        """Return the weighted Gini impurity of the labels ``y``.

        :param y: labels of the samples in the node
        :param sample_weight: weights of those samples, aligned with ``y``
        :return: ``1 - sum_k p_k ** 2`` where ``p_k`` is the weight share of
            class ``k``; ``0.0`` for an empty or zero-weight node
        """
        total = np.sum(sample_weight)
        # An empty or zero-weight node contributes nothing to the split score;
        # returning early also avoids a 0/0 division that would yield NaN.
        if len(y) == 0 or total <= 0:
            return 0.0
        shares = np.array([np.sum(sample_weight[y == k]) for k in np.unique(y)]) / total
        return 1 - np.sum(shares ** 2)

    def __typeFeature(self, X):
        """Flag every column of ``X`` as discrete (0) or continuous (1).

        A column with fewer than ``_MAX_DISCRETE_VALUES`` distinct values is
        considered discrete. The result is stored in ``self.type_feature``
        and also returned.
        """
        n_sample, n_feature = X.shape
        self.type_feature = [
            0 if len(np.unique(X[:, f_idx])) < self._MAX_DISCRETE_VALUES else 1
            for f_idx in range(n_feature)
        ]
        return self.type_feature

    def __binSplitData(self, X, y, f_idx, f_val):
        """Split the data in two on feature ``f_idx`` at value ``f_val``.

        Discrete features send rows equal to ``f_val`` to the left and all
        other rows to the right; continuous features send rows ``<= f_val``
        to the left and rows ``> f_val`` to the right.

        :param X: feature matrix
        :param y: labels aligned with the rows of ``X``
        :param f_idx: index of the feature to split on
        :param f_val: category (discrete) or threshold (continuous)
        :return: ``X_left, X_right, y_left, y_right, weight_left, weight_right``
        """
        att = X[:, f_idx]
        if self.type_feature[f_idx] == 0:
            left_mask = att == f_val
            right_mask = att != f_val
        else:
            left_mask = att <= f_val
            right_mask = att > f_val
        return (X[left_mask], X[right_mask],
                y[left_mask], y[right_mask],
                self.w[left_mask], self.w[right_mask])

    def __candidateValues(self, X, f_idx):
        """Return the split values worth trying for feature ``f_idx``."""
        column = X[:, f_idx]
        if self.type_feature[f_idx] == 0:
            values = np.unique(column)
            # With exactly two categories both one-vs-rest splits describe the
            # same partition, so only one of them needs to be evaluated.
            return values[1:] if len(values) == 2 else values
        lo, hi = np.nanmin(column), np.nanmax(column)
        # Evenly spaced thresholds strictly inside (lo, hi): independent of the
        # feature's scale, and both children are guaranteed to be non-empty.
        return np.linspace(lo, hi, num=self._N_THRESHOLDS + 2)[1:-1]

    def __bestSplit(self, X, y):
        """Find the feature and value whose split has the lowest impurity.

        The score of a split is the Gini impurity of each child scaled by the
        child's total weight. Candidates that leave one child empty are
        skipped; ties go to the candidate examined last.

        :param X: feature matrix
        :param y: labels aligned with the rows of ``X``
        :return: ``(best_f_idx, best_f_val)``
        :raises ValueError: if no candidate separates the samples into two
            non-empty groups
        """
        best_score = np.inf
        best = None
        for f_idx in range(X.shape[1]):
            for f_val in self.__candidateValues(X, f_idx):
                _, _, y_left, y_right, w_left, w_right = self.__binSplitData(X, y, f_idx, f_val)
                if len(y_left) == 0 or len(y_right) == 0:
                    continue
                score = (np.sum(w_left) * self.__Gini(y_left, w_left)
                         + np.sum(w_right) * self.__Gini(y_right, w_right))
                # A NaN score fails this comparison and is therefore ignored.
                if score <= best_score:
                    best_score = score
                    best = (f_idx, f_val)
        if best is None:
            raise ValueError(
                "no feature splits the samples into two non-empty groups")
        return best

    def __CART(self, X, y):
        """Build the stump: the best split plus the labels/weights of each side.

        :param X: feature matrix
        :param y: labels aligned with the rows of ``X``
        :return: dict with keys ``cut_f``, ``cut_val``, ``left_tree``,
            ``right_tree``, ``left_weight`` and ``right_weight``
        """
        best_f_idx, best_f_val = self.__bestSplit(X, y)
        _, _, y_left, y_right, weight_left, weight_right = self.__binSplitData(
            X, y, best_f_idx, best_f_val)
        return {
            'cut_f': best_f_idx,
            'cut_val': best_f_val,
            'left_tree': y_left,
            'right_tree': y_right,
            'left_weight': weight_left,
            'right_weight': weight_right,
        }

    def train(self, X, y, sample_weight):
        """Fit the stump on weighted samples and return its tree dict.

        :param X: 2-D feature matrix (anything ``np.asarray`` accepts)
        :param y: labels, one per row of ``X``
        :param sample_weight: non-negative weights, one per row of ``X``
        :return: the fitted tree dict (also stored in ``self.tree``)
        :raises ValueError: if the inputs have different lengths or no usable
            split exists
        """
        X = np.asarray(X)
        y = np.asarray(y)
        self.w = np.asarray(sample_weight, dtype=float)
        if len(y) != len(X) or len(self.w) != len(X):
            raise ValueError("X, y and sample_weight must have the same length")
        self.type_feature = self.__typeFeature(X)
        self.tree = self.__CART(X, y)
        return self.tree

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        :param X_test: iterable of feature rows
        :return: ``np.ndarray`` with one predicted label per row
        :raises AttributeError: if ``train`` has not been called yet
        """
        if self.tree is None:
            raise AttributeError("weakLearner has not been trained; call train() first")
        # The leaf labels depend only on the tree, so compute them once
        # instead of once per test row.
        labels = self.__leafLabels(self.tree)
        return np.array([self.__predict_one(x_test, self.tree, labels) for x_test in X_test])

    def __leafLabels(self, tree):
        """Return the most frequent training label of the left and right child."""
        label_left = Counter(tree['left_tree']).most_common(1)[0][0]
        label_right = Counter(tree['right_tree']).most_common(1)[0][0]
        return label_left, label_right

    def __predict_one(self, x_test, tree, labels=None):
        """Route one sample through the stump and return its leaf label.

        :param x_test: a single feature row
        :param tree: tree dict as produced by ``train``
        :param labels: optional precomputed ``(label_left, label_right)``
        """
        cut_f_idx, cut_val = tree['cut_f'], tree['cut_val']
        label_left, label_right = labels if labels is not None else self.__leafLabels(tree)
        if self.type_feature[cut_f_idx] == 0:
            goes_left = x_test[cut_f_idx] == cut_val
        else:
            goes_left = x_test[cut_f_idx] <= cut_val
        return label_left if goes_left else label_right

class Adaboost():
    """Binary AdaBoost classifier built from weighted weak learners.

    Labels must be encoded as -1 / +1: the weight update multiplies the true
    label by the prediction, and ``predict`` returns the sign of the weighted
    vote.

    Attributes:
        estimators: number of boosting rounds.
        w: sample weights after the last boosting round (``None`` before ``fit``).
        alphas: vote weight of each fitted weak learner.
        stumps: the fitted weak learners, in training order.
        weakLearner: class used to create each weak learner. Instances must
            provide ``train(X, y, sample_weight)`` and ``predict(X)``.
    """

    def __init__(self,estimators: int = 10, classifier = weakLearner):
        self.estimators = estimators
        self.w = None
        self.alphas = []
        self.stumps = []
        self.weakLearner = classifier

    def fit(self,X,y):
        """Train ``estimators`` weak learners on successively re-weighted data.

        :param X: 2-D feature matrix
        :param y: labels in {-1, +1}, one per row of ``X``
        :return: ``self``
        :raises ValueError: if ``X`` is empty or ``X`` and ``y`` differ in length
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_sample = len(X)
        if n_sample == 0:
            raise ValueError("cannot fit Adaboost on an empty data set")
        if len(y) != n_sample:
            raise ValueError("X and y must have the same length")

        eps = 1e-6
        self.w = np.full(n_sample, 1.0 / n_sample)
        # Start from a clean slate so that calling fit twice does not stack
        # the new learners on top of the old ones.
        self.alphas = []
        self.stumps = []
        for _ in range(self.estimators):
            G_m = self.weakLearner()
            G_m.train(X, y, self.w)
            # Predict on the training set so the predictions are in the same
            # sample order as y and self.w.
            G = G_m.predict(X)
            error = np.sum(self.w * (G != y))
            # Keep the error strictly inside (0, 1) so the logarithm is finite.
            error = min(max(error, eps), 1 - eps)
            alpha = 1/2 * np.log((1 - error) / error)
            updated = self.w * np.exp(- y * G * alpha)
            # Dividing by the normaliser Zm keeps the weights a distribution.
            Zm = np.sum(updated)
            self.w = updated / Zm
            self.stumps.append(G_m)
            self.alphas.append(alpha)
        return self

    def predict(self,X_test):
        """Return the sign of the alpha-weighted vote of all weak learners.

        :param X_test: 2-D feature matrix
        :return: float array of -1.0 / +1.0 (0.0 where the vote is exactly tied)
        """
        X_test = np.asarray(X_test)
        y_ = np.zeros(len(X_test))
        for alpha, stump in zip(self.alphas, self.stumps):
            y_ += alpha * stump.predict(X_test)
        return np.sign(y_)

分别用鸢尾花数据集和乳腺癌数据集测试了算法；鸢尾花数据集过于简单，预测精度很高，因此这里使用乳腺癌数据集。

if __name__ == '__main__':
    # These imports are deliberately left at module scope (not wrapped in a
    # function): the classes in this file look up `np` and `Counter` as
    # module-level names.
    import  numpy as np
    from collections import  Counter
    from sklearn import datasets
    from sklearn.model_selection import train_test_split

    # Demo: train on the breast-cancer data set and report hold-out accuracy.
    # iris = datasets.load_iris()
    dataset = datasets.load_breast_cancer()

    # Re-encode the targets in place as -1 / +1, the encoding Adaboost expects.
    labels = dataset.target
    labels[labels > 0] = 1
    labels[labels == 0] = -1

    features = dataset.data
    X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size=0.2)

    model = Adaboost()
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)

    n_correct = np.sum(Y_pred == Y_test)
    print('acc:{}'.format(n_correct / len(Y_test)))

完整代码也可以参考 GitHub：https://github.com/tingting417/Adaboost-based-on-CART/blob/master/Adaboost.ipynb