Task 02 决策树（下）CART分类代码实现（参考datawhale）

最新推荐文章于 2023-03-04 17:06:36 发布

小果一粒沙

最新推荐文章于 2023-03-04 17:06:36 发布

阅读量458

点赞数

分类专栏： python 机器学习文章标签：决策树机器学习深度学习

本文链接：https://blog.csdn.net/qq_35167821/article/details/120816213

版权

python 同时被 2 个专栏收录

26 篇文章 3 订阅

订阅专栏

机器学习

16 篇文章 0 订阅

订阅专栏

主要是改写CART回归代码

两点：

分裂节点的标准，从MSE变成了gini
叶子节点的输出值，由原来的取平均值变成取出现次数最多的类别（众数）

先写一下大致的代码逻辑吧
总的来说

定义节点分类方式
定义一个树
如何生成一棵树
肯定是要进行节点的分裂
判断一个节点分裂还是不分裂 -> 定义分裂的条件
可以分裂之后，怎么办 -> 需要分裂的方式（即如何进行节点的分裂）
如何进行节点的分裂 -> 定义分裂的准则
找到哪个特征进行分裂 -> 利用信息准则进行判断，找到最优的特征和相应的最优切分点
那个特征怎么被分裂？ -> 定义节点分裂的方式
节点分裂方式是 -> 根据给定的特征，找到这个特征在当前节点下，最优的切分点
节点分裂之后，可以得到左右节点的数据集、深度、分割节点使用的特征、分割节点值

准备工作完成，开始建立一棵树啦。
首先给定根节点，给定根节点的深度，根节点的数据集（以后分裂之后的节点也是给定这两个值）
函数体：总的思想（先序遍历）
先判断当前节点是否是叶子节点，是，则return
找到当前节点的最优切分特征，切分之后的左右数据集、分割点的值
递归左节点（深度加一，当前数据集也是经过gini值切分之后的左节点的数据集）
递归右节点（同左节点）

建树之前还要有一个准备（初始化最初根节点的数据集，初始化每个特征重要性的值（后续累加每次分裂的相对信息增益，即按节点样本占比加权的基尼下降量））

上边是建立一棵树的基本base
接下来需要定义一个分类器（既可以训练，也可以预测）
3. 初始化
4. 定义训练函数fit
5. 定义预测函数predict
6. 定义评价函数来判断预测效果（这里默认的是accuracy）

前半部分：为建树做准备：

import numpy as np
from collections import Counter

def MSE(y):
    """Return the sum of squared deviations of ``y`` from its mean, per row.

    ``y`` must provide ``mean()`` and ``shape`` (for example a NumPy array).
    The squared deviations from ``y.mean()`` are summed and divided by
    ``y.shape[0]``.
    """
    residuals = y - y.mean()
    n_rows = y.shape[0]
    return (residuals * residuals).sum() / n_rows

def gini(y):
    """Return the Gini impurity ``1 - sum_k p_k ** 2`` of a set of labels.

    Parameters
    ----------
    y : array-like
        Class labels of the samples. Any input accepted by ``np.asarray``
        works (NumPy array, list, tuple).

    Returns
    -------
    float
        ``0.0`` when every label is identical, approaching ``1`` as the
        labels spread evenly over many classes. An empty input has no
        impurity and yields ``0.0``.
    """
    labels = np.asarray(y)
    if labels.size == 0:
        return 0.0
    # One pass gives the per-class counts; no need to rescan per class.
    _, counts = np.unique(labels, return_counts=True)
    return 1 - np.sum((counts / labels.size) ** 2)
    

class Node:
    """A single node of the decision tree.

    ``depth`` is the node's distance from the root and ``idx`` identifies
    the training samples that belong to the node (``Tree`` passes a boolean
    mask over the rows of the training set).
    """

    def __init__(self, depth, idx):
        # Children and split rule stay ``None`` until the node is split;
        # a node with no children is treated as a leaf.
        self.left = self.right = None
        self.feature = self.pivot = None

        self.depth, self.idx = depth, idx


class Tree:
    """CART classification tree grown by greedy Gini-impurity minimisation.

    The caller assigns the training data to ``X`` (2-D feature array) and
    ``y`` (1-D label array) and then calls ``build()``. Every node stores a
    boolean mask over the training rows instead of a copy of the data.

    Relies on the module-level ``gini`` function and ``Node`` class.

    Attributes
    ----------
    max_depth : int
        Nodes at this depth are never split.
    feature_importances_ : numpy.ndarray or None
        Per-feature sum of sample-weighted impurity decreases
        (un-normalised); filled in by ``build()``.
    depth : int
        Depth of the deepest node created so far.
    root : Node or None
        Root node; created by ``build()``.
    """

    def __init__(self, max_depth):
        self.max_depth = max_depth

        self.X = None
        self.y = None
        self.feature_importances_ = None
        self.depth = 0
        self.root = None

    def _able_to_split(self, node):
        """Return True if ``node`` is above ``max_depth`` and holds >= 2 samples.

        ``node.idx`` is a boolean mask, so ``sum()`` counts the samples in
        the node.
        """
        return bool(node.depth < self.max_depth and node.idx.sum() >= 2)

    def _get_inner_split_score(self, to_left, to_right):
        """Return the sample-weighted Gini impurity of a left/right partition."""
        n_left, n_right = to_left.sum(), to_right.sum()
        total_num = n_left + n_right
        left_val = n_left / total_num * gini(self.y[to_left])
        right_val = n_right / total_num * gini(self.y[to_right])
        return left_val + right_val

    def _inner_split(self, col, idx):
        """Find the best threshold on feature ``col`` for the samples in ``idx``.

        Returns
        -------
        tuple
            ``(score, to_left, to_right, pivot)``: the lowest weighted Gini
            impurity, the boolean masks of the two children and the
            threshold (samples with ``value <= pivot`` go left). When the
            feature cannot separate the node's samples the result is
            ``(np.inf, None, None, None)``.
        """
        # Keep the full column so the comparison stays aligned with ``idx``.
        data = self.X[:, col]
        in_node = idx == 1
        best_val = np.inf
        best_to_left = best_to_right = best_pivot = None
        # Candidate thresholds are the distinct values present in the node;
        # the largest one is dropped because it would send every sample left.
        for pivot in np.unique(data[in_node])[:-1]:
            to_left = in_node & (data <= pivot)
            to_right = in_node & (~to_left)
            # Defensive: skip degenerate partitions (e.g. a NaN threshold).
            n_left = to_left.sum()
            if n_left == 0 or n_left == in_node.sum():
                continue
            score = self._get_inner_split_score(to_left, to_right)
            if best_val > score:
                best_val, best_pivot = score, pivot
                best_to_left, best_to_right = to_left, to_right
        return best_val, best_to_left, best_to_right, best_pivot

    def _get_conditional_entropy(self, idx):
        """Search every feature for the split with the lowest weighted impurity.

        Returns
        -------
        tuple
            ``(score, idx_left, idx_right, feature, pivot)``. If no feature
            yields a valid split, ``score`` is ``np.inf`` and the remaining
            entries are ``None``.
        """
        best_val = np.inf
        idx_left = idx_right = best_feature = best_pivot = None
        for col in range(self.X.shape[1]):
            score, _idx_left, _idx_right, pivot = self._inner_split(col, idx)
            if best_val > score:
                best_val, idx_left, idx_right = score, _idx_left, _idx_right
                best_feature, best_pivot = col, pivot
        return best_val, idx_left, idx_right, best_feature, best_pivot

    def split(self, node):
        """Split ``node`` in place if possible and record feature importance.

        Returns
        -------
        tuple
            ``(idx_left, idx_right, feature, pivot)`` of the chosen split, or
            four ``None`` values when the node stays a leaf (depth/size
            limit reached, node already pure, or no feature separates its
            samples).
        """
        no_split = (None, None, None, None)
        if not self._able_to_split(node):
            return no_split
        # Impurity of the node before splitting.
        impurity = gini(self.y[node.idx == 1])
        if impurity == 0:
            # All labels agree; a split cannot improve anything.
            return no_split
        # Lowest weighted impurity reachable with a single split.
        (
            conditional_impurity,
            idx_left,
            idx_right,
            feature,
            pivot
        ) = self._get_conditional_entropy(node.idx)
        if feature is None:
            # Every feature is constant inside this node.
            return no_split
        # Impurity decrease, weighted by the share of samples in the node.
        info_gain = impurity - conditional_impurity
        relative_gain = node.idx.sum() / self.X.shape[0] * info_gain
        self.feature_importances_[feature] += relative_gain
        # Create the children and track the deepest level reached.
        node.left = Node(node.depth+1, idx_left)
        node.right = Node(node.depth+1, idx_right)
        self.depth = max(node.depth+1, self.depth)
        return idx_left, idx_right, feature, pivot

    def build_prepare(self):
        """Reset depth and importances and create a root holding all samples."""
        self.depth = 0
        self.feature_importances_ = np.zeros(self.X.shape[1])
        # The root mask is all True: it contains every training row.
        self.root = Node(depth=0, idx=np.ones(self.X.shape[0]) == 1)

    def build_node(self, cur_node):
        """Grow the subtree under ``cur_node`` depth-first (pre-order)."""
        if cur_node is None:
            return
        idx_left, idx_right, feature, pivot = self.split(cur_node)
        cur_node.feature, cur_node.pivot = feature, pivot
        self.build_node(cur_node.left)
        self.build_node(cur_node.right)

    def build(self):
        """Fit the tree to the data currently stored in ``X`` and ``y``."""
        self.build_prepare()
        self.build_node(self.root)

    def _search_prediction(self, node, x):
        """Route sample ``x`` from ``node`` to a leaf; return its majority label.

        Ties between classes are resolved in favour of the smallest label.
        """
        while node.left is not None or node.right is not None:
            if x[node.feature] <= node.pivot:
                node = node.left
            else:
                node = node.right
        # Return the most frequent label itself, not the sample found at
        # the position given by that label.
        labels, counts = np.unique(self.y[node.idx], return_counts=True)
        return labels[np.argmax(counts)]

    def predict(self, x):
        """Return the predicted class label for a single sample ``x``."""
        return self._search_prediction(self.root, x)

后半部分：构建分类器


# class DecisionTreeRegressor:
class DecisionTreeClassifier:
    """CART decision-tree classifier with a scikit-learn-style interface.

    ``max_depth`` bounds the depth of the underlying ``Tree``. The class is
    intended to mirror ``sklearn.tree.DecisionTreeClassifier`` with default
    parameters.

    Attributes
    ----------
    tree : Tree
        The underlying tree that is grown by ``fit``.
    feature_importances_ : numpy.ndarray
        Importances normalised to sum to 1; available after ``fit``. All
        zeros when the tree made no split.
    """

    def __init__(self, max_depth):
        self.tree = Tree(max_depth=max_depth)

    def fit(self, X, y):
        """Grow the tree on features ``X`` and labels ``y``; return ``self``."""
        self.tree.X = np.asarray(X)
        self.tree.y = np.asarray(y)
        self.tree.build()
        raw_importances = self.tree.feature_importances_
        total = raw_importances.sum()
        if total > 0:
            self.feature_importances_ = raw_importances / total
        else:
            # No split happened: avoid 0 / 0 turning every entry into NaN.
            self.feature_importances_ = np.zeros_like(raw_importances)
        return self

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``."""
        return np.array([self.tree.predict(x) for x in np.asarray(X)])

    def evaluate(self, X, y, metric='accuracy'):
        """Score the predictions for ``X`` against the true labels ``y``.

        Only ``metric='accuracy'`` (fraction of correct predictions) is
        supported.

        Raises
        ------
        ValueError
            If ``metric`` is not a supported metric name.
        """
        if metric != 'accuracy':
            raise ValueError(f"unsupported metric: {metric!r}")
        y = np.asarray(y)
        y_predict = self.predict(X)
        return np.count_nonzero(y_predict == y) / len(y)

输入数据，测试效果

# from CART import DecisionTreeRegressor
from CART import DecisionTreeClassifier
# from sklearn.tree import DecisionTreeRegressor as dt
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.datasets import make_regression
from sklearn.datasets import make_classification

import sys
if __name__ == "__main__":

    # Simulated classification data set.
    X, y = make_classification(
        n_samples=400, n_features=10, n_informative=5, random_state=0
    )

    # Fit the hand-written CART classifier.
    my_cart = DecisionTreeClassifier(max_depth=2)
    my_cart.fit(X, y)
    res1 = my_cart.predict(X)
    print('accuracy of res1: ', len(y[res1 == y]) / len(y))
    importance1 = my_cart.feature_importances_

    # Fit scikit-learn's classifier with the same depth limit for comparison.
    sklearn_cart = dt(max_depth=2)
    sklearn_cart.fit(X, y)
    res2 = sklearn_cart.predict(X)
    print('accuracy of res2: ', len(y[res2 == y]) / len(y))
    importance2 = sklearn_cart.feature_importances_

    # Fraction of samples on which both models predict the same label.
    print((res1 == res2).mean())
    # Fraction of features whose importances agree within 1e-8; the absolute
    # value keeps large negative differences from counting as agreement.
    print((abs(importance1 - importance2) < 1e-8).mean())