Datawhale October Learning (Tree Models and Ensemble Learning): Implementing a CART Tree

Recap

  1. Decision trees

Summary

Although the teaching assistant provided reference code for this assignment, I still tried to implement a CART tree from scratch based on my own understanding.
The tree I wrote handles both classification and regression tasks, and it can enforce a maximum tree depth.
Working through the exercise deepened my understanding of how CART trees work.

Code implementation

The CART tree implemented here supports classification and regression tasks, a maximum-depth limit (a maximum leaf-node count is not implemented yet), and discretization of continuous attributes.

Imports and helper functions

This part defines the Gini impurity computation, plus the MSE and MAE error measures and the gain calculations built on them.
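For reference, these are the quantities the helpers below compute: the Gini impurity of a label set D, the Gini gain of partitioning D by the values v of a feature, and, for regression, the impurity reduction of a binary split into D_L and D_R (with f the summed MSE or MAE):

$$G(D) = 1 - \sum_k p_k^2, \qquad \Delta_{\text{cls}} = G(D) - \sum_v \frac{|D_v|}{|D|}\, G(D_v)$$

$$\Delta_{\text{reg}} = \frac{1}{|D|}\Big( f(y, \bar y) - f(y_L, \bar y_L) - f(y_R, \bar y_R) \Big)$$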

import numpy as np

def _labelcount(data):
    # unique labels and their empirical frequencies
    mask = np.unique(data)
    tmp = []
    for each in mask:
        tmp.append(np.sum(data == each))
    p = np.array(tmp) / len(data)
    return mask, p

def _gini(data):
    # Gini impurity: 1 - sum(p_k^2)
    _, dataK = _labelcount(data)
    gini = 1 - (dataK**2).sum()
    return gini

def _MSE(y, yP):
    # sum of squared errors (normalized later in G_regression)
    return ((y-yP)**2).sum()

def _MAE(y, yP):
    # sum of absolute errors (normalized later in G_regression)
    return (np.absolute(y-yP)).sum()

def G_classification(x, y):
    # Gini gain of partitioning y by the values of x
    x = np.asarray(x)  # a plain list would break the boolean masks below
    giniY = _gini(y)
    
    mask = np.unique(x)
    tmpGini = []
    count = []
    for each in mask:
        tmpGini.append(_gini(y[x == each]))
        count.append(np.sum(x == each))
    giniYX = (np.array(tmpGini)*np.array(count)/len(x)).sum()
    
    return giniY - giniYX
    
def G_regression(x, y, yNow = None, function = "MSE"):
    # x is a 0/1 indicator of the split; the gain is the parent impurity
    # minus the two children's impurities, all normalized by len(y)
    if yNow is None:
        yNow = y.mean()
    
    if function == "MSE":
        f = _MSE
    elif function == "MAE":
        f = _MAE
    else:
        raise ValueError("function must be 'MSE' or 'MAE'")
    
    x = np.asarray(x)  # a plain list would break the boolean masks below
    total = f(y,yNow)/len(y)
    
    yLeft = y[x==0].mean()
    left = f(y[x==0],yLeft)/len(y)
    
    yRight = y[x==1].mean()
    right = f(y[x==1],yRight)/len(y)
    
    return total - left - right, yLeft, yRight
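
A quick sanity check of these helpers on a toy, perfectly separable split (the numbers are invented purely for illustration):

import numpy as np

# x == 0 carries label 0 and x == 1 carries label 1, so the split is perfect
x = np.array([0, 0, 1, 1])
y = np.array([0, 0, 1, 1])
print(_gini(y))                # 0.5: two balanced classes
print(G_classification(x, y))  # 0.5: the split removes all impurity

# the same split on a regression target: child means 1.0 and 3.0, gain 1.0
yr = np.array([1.0, 1.0, 3.0, 3.0])
gain, yLeft, yRight = G_regression(x, yr, function = "MSE")
print(gain, yLeft, yRight)     # 1.0 1.0 3.0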

Tree node class

Used to store per-node information.

class TreeNode():
    def __init__(self, leaf = False, nodeLabel = None, depth = 0, father = None):
        self.leaf = leaf
        self.label = nodeLabel  # prediction stored once the node becomes a leaf
        self.feature = None     # index of the splitting feature
        self.divide = None      # category tuple (discrete) or threshold (continuous)
        self.left = None
        self.right = None
        self.father = father
        self.depth = depth

Core tree implementation

class Tree():
    def __init__(self, max_depth = None, min_node_count = 1, max_leaf_nodes = None, 
                 random_state = 33, splitter = "random",
                 regression = False, metric_type = "MSE"):
        self.type = "regression" if regression else "classification"
        if regression:
            self.metric_type = metric_type
            self.decision_metric = G_regression
        else:
            self.decision_metric = G_classification
                
        self.max_depth = max_depth
        self.min_node_count = min_node_count
        self.max_leaf_nodes = max_leaf_nodes  # stored but not enforced yet
        self.random_state = random_state
        self.splitter = splitter
        
    def fit(self,x,y):
        self.root = TreeNode(leaf = False, depth = 0)
        self.leaf_node_count = 0
        self._growth(x,y,self.root)
    
    def predict(self,x):
        return self._search_prediction(x,self.root)
    
    def _divide_node(self,x):
        # heuristic: treat a feature as discrete when it has few unique values
        if len(np.unique(x)) <= len(x)/2:
            # discrete feature: each candidate is a single-category tuple;
            # it must be (each,) rather than (each), which is just a scalar
            # (enumerating multi-category subsets via permutations was tried
            # but is disabled for now)
            choiceLis = np.unique(x)
            divideLis = [(each,) for each in choiceLis]
            divideLis = list(set(divideLis))
            return divideLis
        else:
            if self.splitter == "random":
                # sample between 2 and 10 evenly spaced candidate thresholds
                count = max(min(int(np.ceil(len(x)/self.min_node_count)),10),2)
                np.random.seed(self.random_state)  # seed() must be called, not assigned
                countAct = np.random.randint(2,count+1)
                minVal = x.min()
                maxVal = x.max()
                divideLis = list(np.linspace(minVal,maxVal,countAct))
            else:
                # exhaustive: every observed value is a candidate threshold
                divideLis = list(x)
            return divideLis
    
    def _growth(self,x,y,node):
        # depth-first growth; stop at max depth, minimum node size, or a pure node
        if ((node.depth == self.max_depth) 
            or (len(y) <= self.min_node_count) 
            or (len(np.unique(y)) == 1)):
            node.leaf = True
            self.leaf_node_count += 1
            if self.type == "regression":
                node.label = y.mean()
            else:
                node.label = np.argmax(np.bincount(y.flatten()))
            return None
                
        feature, divide = self._choose_best_split(x,y)
        node.feature = feature
        node.divide = divide
        
        if type(divide) == tuple:
            idx = [each in divide for each in x[:,feature]]
        else:
            idx = [each < divide for each in x[:,feature]]
        
        # guard: a split that sends every sample to one side would recurse
        # forever (or crash on an empty child), so turn the node into a leaf
        if all(idx) or not any(idx):
            node.leaf = True
            self.leaf_node_count += 1
            if self.type == "regression":
                node.label = y.mean()
            else:
                node.label = np.argmax(np.bincount(y.flatten()))
            return None
        
        node.left = TreeNode(leaf = False, father = node, depth = node.depth+1)
        self._growth(x[idx,:],y[idx],node.left)
        
        idx = [not each for each in idx]
        node.right = TreeNode(leaf = False, father = node, depth = node.depth+1)
        self._growth(x[idx,:],y[idx],node.right)

    def _choose_best_split(self,x,y):
        # gather every candidate (feature, split) pair
        divideLis = []
        featureLis = []
        for eachfeature in range(x.shape[1]):
            lis = self._divide_node(x[:,eachfeature])
            divideLis.extend(lis)
            featureLis.extend([eachfeature]*len(lis))
            
        maxind = 0
        maxVal = 0
        for i in range(len(divideLis)):
            divide = divideLis[i]
            feature = featureLis[i]
            # encode the split as a 0/1 array (np.array, so the boolean
            # masks inside the metric functions work correctly)
            if type(divide) == tuple:
                xD = np.array([0 if each in divide else 1 for each in x[:,feature]])
            else:
                xD = np.array([0 if each < divide else 1 for each in x[:,feature]])
            if self.type == "classification":
                new = self.decision_metric(xD, y)
            elif self.type == "regression":
                new,_,_ = self.decision_metric(xD, y, y.mean(), function = self.metric_type)
            # track the best gain; maxVal must be updated too, otherwise the
            # last positive-gain candidate wins instead of the best one
            if new > maxVal:
                maxVal = new
                maxind = i
        
        return featureLis[maxind], divideLis[maxind]
            
    def _search_prediction(self,x,node):
        # walk down the tree until a leaf is reached
        if node.leaf:
            return node.label
        if type(node.divide) == tuple:
            bo = x[node.feature] in node.divide
        else:
            bo = x[node.feature] < node.divide
        if bo:
            return self._search_prediction(x,node.left)
        return self._search_prediction(x,node.right)
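
As a minimal sketch of how the class dispatches the two split types, here is a toy run on a single discrete feature (the data is invented for illustration):

import numpy as np

# two unique values among four samples, so _divide_node treats the feature
# as discrete and emits single-category tuples as candidate splits
xTrain = np.array([[0], [0], [1], [1]])
yTrain = np.array([0, 0, 1, 1])

tree = Tree(max_depth = 2)
tree.fit(xTrain, yTrain)
print(tree.root.divide)             # a single-category tuple, e.g. (0,)
print(tree.predict(np.array([0])))  # 0
print(tree.predict(np.array([1])))  # 1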

Wrapper classes

class ClassificationTree():
    def __init__(self, max_depth = None, min_node_count = 1, max_leaf_nodes = None, 
                 random_state = 33, splitter = "random",):
        self.tree = Tree(max_depth = max_depth,
                         min_node_count = min_node_count,
                         max_leaf_nodes = max_leaf_nodes,
                         random_state = random_state,
                         splitter = splitter,
                         regression = False)
        
    def fit(self,x,y):
        self.tree.fit(x,y)
        
    def predict(self,x):
        y = []
        for i in range(x.shape[0]):
            y.append(self.tree.predict(x[i,:]))
        return np.array(y)
    
class RegressionTree():
    def __init__(self, max_depth = None, min_node_count = 1, max_leaf_nodes = None, 
                 random_state = 33, splitter = "random", metric_type = "MSE"):
        self.tree = Tree(max_depth = max_depth,
                         min_node_count = min_node_count,
                         max_leaf_nodes = max_leaf_nodes,
                         random_state = random_state,
                         splitter = splitter,
                         regression = True,
                         metric_type = metric_type)
        
    def fit(self,x,y):
        self.tree.fit(x,y)
        
    def predict(self,x):
        y = []
        for i in range(x.shape[0]):
            y.append(self.tree.predict(x[i,:]))
        return np.array(y)[np.newaxis,:]
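
A toy usage sketch for the regression wrapper (the data below is invented). Note that its predict returns shape (1, n) because of the [np.newaxis,:], whereas the classification wrapper returns a flat (n,) array:

import numpy as np

xTrain = np.linspace(0, 10, 50).reshape(-1, 1)
yTrain = 2.0 * xTrain.ravel() + 1.0

reg = RegressionTree(max_depth = 4, metric_type = "MSE")
reg.fit(xTrain, yTrain)
pred = reg.predict(xTrain)
print(pred.shape)  # (1, 50): note the extra leading axis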

Testing

Classification was tested by training on the Iris dataset; the results match sklearn's decision tree.
Regression was tested on a toy dataset; the gap versus sklearn is still fairly large and remains to be investigated.
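
For reproducibility, a minimal version of the Iris comparison might look like the following (the exact train/test split and hyperparameters of the original run are not recorded, so those below are assumptions):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

data = load_iris()
xTrain, xTest, yTrain, yTest = train_test_split(
    data.data, data.target, test_size = 0.3, random_state = 33)

# the CART tree implemented above
clf = ClassificationTree(max_depth = 3)
clf.fit(xTrain, yTrain)
acc = (clf.predict(xTest) == yTest).mean()

# sklearn baseline with the same depth limit
ref = DecisionTreeClassifier(max_depth = 3, random_state = 33)
ref.fit(xTrain, yTrain)
accRef = ref.score(xTest, yTest)

print(acc, accRef)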
