CART决策树的Python实现

最新推荐文章于 2024-07-11 09:58:56 发布

WiseDoge

最新推荐文章于 2024-07-11 09:58:56 发布

阅读量3k

点赞数 1

分类专栏：技术文章　标签：python、机器学习、数据挖掘、算法

本文链接：https://blog.csdn.net/WiseDoge/article/details/57077787

版权

技术专栏收录该内容

9 篇文章 0 订阅

订阅专栏

完整的代码请见：https://github.com/WiseDoge/ML-by-Python

CART决策树分类器

from collections import defaultdict
import numpy as np


class TreeNode(object):
    """A single node of the decision tree."""

    # Names of the attributes every node carries, in assignment order.
    _FIELDS = ('attr_index', 'attr', 'label', 'left_child', 'right_child')

    def __init__(self, **kwargs):
        """
        Create a node from keyword arguments; any field not supplied is None.

        attr_index: index of the attribute this node refers to
        attr: attribute value stored on the node
        label: class label (y)
        left_child: left child node
        right_child: right child node

        Keyword arguments other than the five above are ignored.
        """
        for field_name in self._FIELDS:
            setattr(self, field_name, kwargs.get(field_name))


class DecisionTreeClassifier(object):
    """
    Decision tree classifier.

    Implements the classification side of CART (classification and
    regression tree). Every internal node tests ``x[attr_index] == attr``:
    samples that match go to the left child, all others to the right child.
    Splits are chosen by minimising the Gini index, and each feature is used
    at most once along any root-to-leaf path.
    """

    def __init__(self):
        # Root node of the tree; stays None until fit() builds it.
        self.root = None

    def gini(self, cluster):
        """
        Compute the Gini impurity of a data set.

        :param cluster: a subset of the training set; the last element of
                        every row is the class label
        :return: 1 - sum(p_k ** 2) over the label frequencies p_k
                 (1.0 for an empty ``cluster``)
        """
        total = len(cluster)
        counts = defaultdict(int)
        for line in cluster:
            counts[line[-1]] += 1
        impurity = 1.0
        for count in counts.values():
            impurity -= (count / total) ** 2
        return impurity

    def gini_index(self, cluster, attr_index):
        """
        Find the best split value of one feature.

        For every distinct value of the feature, the data is split into the
        rows equal to that value and the rows different from it, and the
        size-weighted Gini impurity of the two parts is computed.

        :param cluster: a subset of the training set
        :param attr_index: feature index (column number)
        :return: (best value of the feature, Gini index of that split)
        """
        groups = defaultdict(list)
        for line in cluster:
            groups[line[attr_index]].append(line)

        total = len(cluster)
        attr_gini = {}
        for value, matching in groups.items():
            # The complement must hold the rows of every OTHER group, not
            # copies of the current one.
            rest = []
            for other_value, other_lines in groups.items():
                if other_value == value:
                    continue
                rest.extend(other_lines)
            attr_gini[value] = (self.gini(matching) * len(matching)
                                + self.gini(rest) * len(rest)) / total
        attr = min(attr_gini, key=attr_gini.get)
        return attr, attr_gini[attr]

    def devide_set(self, cluster, index, attr):
        """
        Split a data set in two on one feature value.

        :param cluster: the data set (a subset of the training set)
        :param index: feature index
        :param attr: feature value
        :return: (left, right) as numpy arrays; ``left`` holds the rows whose
                 feature ``index`` equals ``attr``, ``right`` the other rows
        """
        left = []
        right = []
        for line in cluster:
            if line[index] == attr:
                left.append(line)
            else:
                right.append(line)
        return np.array(left), np.array(right)

    def get_best_index(self, cluster, attr_indexs):
        """
        Pick the best feature and split value among the given features.

        :param cluster: the data set
        :param attr_indexs: collection of feature indexes available to split on
        :return: (best feature index, best split value of that feature)
        """
        candidates = {}
        for attr_index in attr_indexs:
            candidates[attr_index] = self.gini_index(cluster, attr_index)
        # Each entry is (split value, Gini index); minimise the Gini index.
        attr_index = min(candidates, key=lambda x: candidates[x][1])
        return attr_index, candidates[attr_index][0]

    def _majority_label(self, cluster):
        """Return the most frequent class label (last column) of ``cluster``."""
        counts = defaultdict(int)
        for line in cluster:
            counts[line[-1]] += 1
        return max(counts, key=counts.get)

    def _has_multiple_values(self, cluster, attr_index):
        """Return True if column ``attr_index`` holds at least two distinct values."""
        column = cluster[:, attr_index]
        first = column[0]
        return any(value != first for value in column)

    def build_tree(self, cluster, attr_indexs):
        """
        Recursively build the decision tree.

        :param cluster: 2-D numpy array with at least one row; the last
                        column is the class label
        :param attr_indexs: set of feature indexes still available to split on
        :return: a TreeNode (a leaf carrying ``label``, or an internal node
                 carrying ``attr_index``, ``attr`` and both children)
        """
        labels = cluster[:, -1]
        first_label = labels[0]
        # All samples share one class: nothing left to separate.
        if all(label == first_label for label in labels):
            return TreeNode(label=first_label)

        # Only a feature with at least two distinct values can produce two
        # non-empty partitions. Offering only those features guarantees that
        # neither recursive call below receives an empty data set.
        splittable = set(i for i in attr_indexs
                         if self._has_multiple_values(cluster, i))
        if not splittable:
            # No usable feature (none left, or all constant): majority vote.
            return TreeNode(label=self._majority_label(cluster))

        attr_index, attr = self.get_best_index(cluster, splittable)
        left, right = self.devide_set(cluster, attr_index, attr)

        # A feature that has been split on is not reused further down.
        new_attr_indexs = set(attr_indexs) - {attr_index}

        left_branch = self.build_tree(left, new_attr_indexs)
        right_branch = self.build_tree(right, new_attr_indexs)
        return TreeNode(left_child=left_branch,
                        right_child=right_branch,
                        attr_index=attr_index,
                        attr=attr)

    def fit(self, train_x, train_y):
        """
        Fit the decision tree.

        :param train_x: training features, 2-D (samples x features)
        :param train_y: training targets, one per sample
        :return: None
        """
        train_x = np.asarray(train_x)
        attr_indexs = set(range(train_x.shape[1]))
        # Features and labels are kept in one array, labels as last column.
        self.train_x = np.c_[train_x, train_y]
        self.root = self.build_tree(self.train_x, attr_indexs)

    def predict_one(self, x):
        """
        Predict the class of a single sample.

        :param x: the sample to classify, indexable by feature index
        :return: the class label stored in the leaf that ``x`` reaches
        """
        node_p = self.root
        # Internal nodes carry no label; walk down until a leaf is reached.
        while node_p.label is None:
            if x[node_p.attr_index] == node_p.attr:
                node_p = node_p.left_child
            else:
                node_p = node_p.right_child
        return node_p.label

    def predict(self, test_x):
        """
        Predict the classes of several samples.

        :param test_x: iterable of samples
        :return: numpy array with the predicted label of every sample
        """
        return np.array([self.predict_one(x) for x in test_x])