Datawhale October Learning (Tree Models and Ensemble Learning): Implementing a CART Tree

Recap

  1. Decision trees

Summary

Although the teaching assistant provided reference code for this assignment, I still tried to implement a CART tree from scratch based on my own understanding.
The tree I wrote handles both classification and regression tasks, and it can enforce a maximum tree depth.
Working through the exercise deepened my understanding of how CART trees work.

Code implementation

The CART tree implemented here supports classification and regression tasks, a maximum-depth limit (a maximum leaf-node count is not implemented yet), and discretization of continuous attributes.

Imports and helper functions

This part defines the Gini impurity computation, plus the MSE and MAE error measures and the gain calculations built on them.
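For reference, these are the quantities the helpers below compute: the Gini impurity of a label set D, the Gini gain of partitioning D by the values v of a feature, and, for regression, the impurity reduction of a binary split into D_L and D_R (with f the summed MSE or MAE):

$$G(D) = 1 - \sum_k p_k^2, \qquad \Delta_{\text{cls}} = G(D) - \sum_v \frac{|D_v|}{|D|}\, G(D_v)$$

$$\Delta_{\text{reg}} = \frac{1}{|D|}\Big( f(y, \bar y) - f(y_L, \bar y_L) - f(y_R, \bar y_R) \Big)$$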

import numpy as np

def _labelcount(data):
    # unique labels and their empirical frequencies
    mask = np.unique(data)
    tmp = []
    for each in mask:
        tmp.append(np.sum(data == each))
    p = np.array(tmp) / len(data)
    return mask, p

def _gini(data):
    # Gini impurity: 1 - sum(p_k^2)
    _, dataK = _labelcount(data)
    gini = 1 - (dataK**2).sum()
    return gini

def _MSE(y, yP):
    # sum of squared errors (normalized later in G_regression)
    return ((y-yP)**2).sum()

def _MAE(y, yP):
    # sum of absolute errors (normalized later in G_regression)
    return (np.absolute(y-yP)).sum()

def G_classification(x, y):
    # Gini gain of partitioning y by the values of x
    x = np.asarray(x)  # a plain list would break the boolean masks below
    giniY = _gini(y)
    
    mask = np.unique(x)
    tmpGini = []
    count = []
    for each in mask:
        tmpGini.append(_gini(y[x == each]))
        count.append(np.sum(x == each))
    giniYX = (np.array(tmpGini)*np.array(count)/len(x)).sum()
    
    return giniY - giniYX
    
def G_regression(x, y, yNow = None, function = "MSE"):
    # x is a 0/1 indicator of the split; the gain is the parent impurity
    # minus the two children's impurities, all normalized by len(y)
    if yNow is None:
        yNow = y.mean()
    
    if function == "MSE":
        f = _MSE
    elif function == "MAE":
        f = _MAE
    else:
        raise ValueError("function must be 'MSE' or 'MAE'")
    
    x = np.asarray(x)  # a plain list would break the boolean masks below
    total = f(y,yNow)/len(y)
    
    yLeft = y[x==0].mean()
    left = f(y[x==0],yLeft)/len(y)
    
    yRight = y[x==1].mean()
    right = f(y[x==1],yRight)/len(y)
    
    return total - left - right, yLeft, yRight
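
A quick sanity check of these helpers on a toy, perfectly separable split (the numbers are invented purely for illustration):

import numpy as np

# x == 0 carries label 0 and x == 1 carries label 1, so the split is perfect
x = np.array([0, 0, 1, 1])
y = np.array([0, 0, 1, 1])
print(_gini(y))                # 0.5: two balanced classes
print(G_classification(x, y))  # 0.5: the split removes all impurity

# the same split on a regression target: child means 1.0 and 3.0, gain 1.0
yr = np.array([1.0, 1.0, 3.0, 3.0])
gain, yLeft, yRight = G_regression(x, yr, function = "MSE")
print(gain, yLeft, yRight)     # 1.0 1.0 3.0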

Tree node class

Used to store per-node information.

class TreeNode():
    def __init__(self, leaf = False, nodeLabel = None, depth = 0, father = None):
        self.leaf = leaf
        self.label = nodeLabel  # prediction stored once the node becomes a leaf
        self.feature = None     # index of the splitting feature
        self.divide = None      # category tuple (discrete) or threshold (continuous)
        self.left = None
        self.right = None
        self.father = father
        self.depth = depth

Core tree implementation

class Tree():
    def __init__(self, max_depth = None, min_node_count = 1, max_leaf_nodes = None, 
                 random_state = 33, splitter = "random",
                 regression = False, metric_type = "MSE"):
        self.type = "regression" if regression else "classification"
        if regression:
            self.metric_type = metric_type
            self.decision_metric = G_regression
        else:
            self.decision_metric = G_classification
                
        self.max_depth = max_depth
        self.min_node_count = min_node_count
        self.max_leaf_nodes = max_leaf_nodes  # stored but not enforced yet
        self.random_state = random_state
        self.splitter = splitter
        
    def fit(self,x,y):
        self.root = TreeNode(leaf = False, depth = 0)
        self.leaf_node_count = 0
        self._growth(x,y,self.root)
    
    def predict(self,x):
        return self._search_prediction(x,self.root)
    
    def _divide_node(self,x):
        # heuristic: treat a feature as discrete when it has few unique values
        if len(np.unique(x)) <= len(x)/2:
            # discrete feature: each candidate is a single-category tuple;
            # it must be (each,) rather than (each), which is just a scalar
            # (enumerating multi-category subsets via permutations was tried
            # but is disabled for now)
            choiceLis = np.unique(x)
            divideLis = [(each,) for each in choiceLis]
            divideLis = list(set(divideLis))
            return divideLis
        else:
            if self.splitter == "random":
                # sample between 2 and 10 evenly spaced candidate thresholds
                count = max(min(int(np.ceil(len(x)/self.min_node_count)),10),2)
                np.random.seed(self.random_state)  # seed() must be called, not assigned
                countAct = np.random.randint(2,count+1)
                minVal = x.min()
                maxVal = x.max()
                divideLis = list(np.linspace(minVal,maxVal,countAct))
            else:
                # exhaustive: every observed value is a candidate threshold
                divideLis = list(x)
            return divideLis
    
    def _growth(self,x,y,node):
        # depth-first growth; stop at max depth, minimum node size, or a pure node
        if ((node.depth == self.max_depth) 
            or (len(y) <= self.min_node_count) 
            or (len(np.unique(y)) == 1)):
            node.leaf = True
            self.leaf_node_count += 1
            if self.type == "regression":
                node.label = y.mean()
            else:
                node.label = np.argmax(np.bincount(y.flatten()))
            return None
                
        feature, divide = self._choose_best_split(x,y)
        node.feature = feature
        node.divide = divide
        
        if type(divide) == tuple:
            idx = [each in divide for each in x[:,feature]]
        else:
            idx = [each < divide for each in x[:,feature]]
        
        # guard: a split that sends every sample to one side would recurse
        # forever (or crash on an empty child), so turn the node into a leaf
        if all(idx) or not any(idx):
            node.leaf = True
            self.leaf_node_count += 1
            if self.type == "regression":
                node.label = y.mean()
            else:
                node.label = np.argmax(np.bincount(y.flatten()))
            return None
        
        node.left = TreeNode(leaf = False, father = node, depth = node.depth+1)
        self._growth(x[idx,:],y[idx],node.left)
        
        idx = [not each for each in idx]
        node.right = TreeNode(leaf = False, father = node, depth = node.depth+1)
        self._growth(x[idx,:],y[idx],node.right)

    def _choose_best_split(self,x,y):
        # gather every candidate (feature, split) pair
        divideLis = []
        featureLis = []
        for eachfeature in range(x.shape[1]):
            lis = self._divide_node(x[:,eachfeature])
            divideLis.extend(lis)
            featureLis.extend([eachfeature]*len(lis))
            
        maxind = 0
        maxVal = 0
        for i in range(len(divideLis)):
            divide = divideLis[i]
            feature = featureLis[i]
            # encode the split as a 0/1 array (np.array, so the boolean
            # masks inside the metric functions work correctly)
            if type(divide) == tuple:
                xD = np.array([0 if each in divide else 1 for each in x[:,feature]])
            else:
                xD = np.array([0 if each < divide else 1 for each in x[:,feature]])
            if self.type == "classification":
                new = self.decision_metric(xD, y)
            elif self.type == "regression":
                new,_,_ = self.decision_metric(xD, y, y.mean(), function = self.metric_type)
            # track the best gain; maxVal must be updated too, otherwise the
            # last positive-gain candidate wins instead of the best one
            if new > maxVal:
                maxVal = new
                maxind = i
        
        return featureLis[maxind], divideLis[maxind]
            
    def _search_prediction(self,x,node):
        # walk down the tree until a leaf is reached
        if node.leaf:
            return node.label
        if type(node.divide) == tuple:
            bo = x[node.feature] in node.divide
        else:
            bo = x[node.feature] < node.divide
        if bo:
            return self._search_prediction(x,node.left)
        return self._search_prediction(x,node.right)
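
As a minimal sketch of how the class dispatches the two split types, here is a toy run on a single discrete feature (the data is invented for illustration):

import numpy as np

# two unique values among four samples, so _divide_node treats the feature
# as discrete and emits single-category tuples as candidate splits
xTrain = np.array([[0], [0], [1], [1]])
yTrain = np.array([0, 0, 1, 1])

tree = Tree(max_depth = 2)
tree.fit(xTrain, yTrain)
print(tree.root.divide)             # a single-category tuple, e.g. (0,)
print(tree.predict(np.array([0])))  # 0
print(tree.predict(np.array([1])))  # 1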

Wrapper classes

class ClassificationTree():
    def __init__(self, max_depth = None, min_node_count = 1, max_leaf_nodes = None, 
                 random_state = 33, splitter = "random",):
        self.tree = Tree(max_depth = max_depth,
                         min_node_count = min_node_count,
                         max_leaf_nodes = max_leaf_nodes,
                         random_state = random_state,
                         splitter = splitter,
                         regression = False)
        
    def fit(self,x,y):
        self.tree.fit(x,y)
        
    def predict(self,x):
        y = []
        for i in range(x.shape[0]):
            y.append(self.tree.predict(x[i,:]))
        return np.array(y)
    
class RegressionTree():
    def __init__(self, max_depth = None, min_node_count = 1, max_leaf_nodes = None, 
                 random_state = 33, splitter = "random", metric_type = "MSE"):
        self.tree = Tree(max_depth = max_depth,
                         min_node_count = min_node_count,
                         max_leaf_nodes = max_leaf_nodes,
                         random_state = random_state,
                         splitter = splitter,
                         regression = True,
                         metric_type = metric_type)
        
    def fit(self,x,y):
        self.tree.fit(x,y)
        
    def predict(self,x):
        y = []
        for i in range(x.shape[0]):
            y.append(self.tree.predict(x[i,:]))
        return np.array(y)[np.newaxis,:]
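
A toy usage sketch for the regression wrapper (the data below is invented). Note that its predict returns shape (1, n) because of the [np.newaxis,:], whereas the classification wrapper returns a flat (n,) array:

import numpy as np

xTrain = np.linspace(0, 10, 50).reshape(-1, 1)
yTrain = 2.0 * xTrain.ravel() + 1.0

reg = RegressionTree(max_depth = 4, metric_type = "MSE")
reg.fit(xTrain, yTrain)
pred = reg.predict(xTrain)
print(pred.shape)  # (1, 50): note the extra leading axis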

Testing

Classification was tested by training on the Iris dataset; the results match sklearn's decision tree.
Regression was tested on a toy dataset; the gap versus sklearn is still fairly large and remains to be investigated.
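
For reproducibility, a minimal version of the Iris comparison might look like the following (the exact train/test split and hyperparameters of the original run are not recorded, so those below are assumptions):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

data = load_iris()
xTrain, xTest, yTrain, yTest = train_test_split(
    data.data, data.target, test_size = 0.3, random_state = 33)

# the CART tree implemented above
clf = ClassificationTree(max_depth = 3)
clf.fit(xTrain, yTrain)
acc = (clf.predict(xTest) == yTest).mean()

# sklearn baseline with the same depth limit
ref = DecisionTreeClassifier(max_depth = 3, random_state = 33)
ref.fit(xTrain, yTrain)
accRef = ref.score(xTest, yTest)

print(acc, accRef)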
