Python Implementation of the CART Binary Classification Tree

1. The Splitting Criterion for Datasets: the Gini Index

The Gini index measures the purity of a dataset: the purer the dataset, the smaller its Gini index.
When building a CART binary classification tree, the Gini index is the criterion used to evaluate how well a candidate split partitions the data.
Concretely, in a classification problem with K classes, where the k-th class occurs with probability $p_k$, the Gini index is defined as:
$$Gini = \sum_{k=1}^{K} p_k (1 - p_k) = 1 - \sum_{k=1}^{K} p_k^2$$
For a given sample set D with K classes, where $C_k$ denotes the set of samples belonging to the k-th class, the Gini index of D is:
$$Gini(D) = 1 - \sum_{k=1}^{K} \left( \frac{|C_k|}{|D|} \right)^2$$
In particular, if D is split into two subsets $D_1$ and $D_2$ according to some value a of feature A, then the Gini index of D conditioned on feature A is:
$$Gini(D, A) = \frac{|D_1|}{|D|} Gini(D_1) + \frac{|D_2|}{|D|} Gini(D_2)$$
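As a quick numerical illustration (a toy label set made up for this post), the snippet below evaluates $Gini(D)$ and $Gini(D, A)$ by hand for 10 binary labels split into two subsets:

import numpy as np

def gini(labels):
    # Gini index of a 1-D array of class labels
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return 1 - np.sum(p ** 2)

# Toy data: 10 labels, split by some feature value into D1 and D2
D = np.array([1, 1, 1, 1, 0, 0, 0, 0, 0, 1])
D1, D2 = D[:6], D[6:]                      # |D1| = 6, |D2| = 4

gini_D = gini(D)                           # 1 - 0.5^2 - 0.5^2 = 0.5
gini_DA = len(D1) / len(D) * gini(D1) + len(D2) / len(D) * gini(D2)
print(gini_D, gini_DA)                     # 0.5, ~0.4167

Since the weighted Gini index drops from 0.5 to about 0.417, this split makes the data purer.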
To learn more about other splitting criteria, see the companion post on entropy, conditional entropy, information gain, information gain ratio, and the Gini index.

1.1 The Gini Index for Binary Classification

In a binary classification problem the labels take only 2 values, 0 and 1. Let $p$ be the proportion of samples in the training set D that belong to class 1; the Gini index then simplifies to:
$$Gini(D) = 2p(1 - p)$$
Python code:

import numpy as np

# dataSet is an ndarray whose last column holds the 0/1 labels
def Gini(dataSet):
    m = np.shape(dataSet)[0]
    num_nonzero = np.count_nonzero(dataSet[:, -1])  # number of samples labelled 1
    p = num_nonzero / m
    return 2 * p * (1 - p)
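
A quick sanity check with a made-up array (last column = labels; three 1s out of four samples, so $p = 0.75$):

data = np.array([[2.0, 1], [3.5, 1], [1.2, 0], [4.1, 1]])
print(Gini(data))  # 2 * 0.75 * 0.25 = 0.375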

2. Fundamentals of the CART Tree

For the underlying theory, see this post: CART树算法详解 (a detailed explanation of the CART tree algorithm).

3. Python Implementation of Binary Classification

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

class Node:
    """
    A node of the decision tree.
    """

    def __init__(self, feature=-1, split_val=None, results=None, left=None, right=None):
        """
        :param feature: index of the feature used to split the dataset
        :param split_val: the value to split on
        :param results: the value stored at a leaf node (None for internal nodes)
        :param left: left subtree
        :param right: right subtree
        """

        self.feature = feature
        self.split_val = split_val
        self.results = results
        self.left = left
        self.right = right



def leaf(dataSet):
    """Compute the value stored at a leaf node: the majority class label.
    :param dataSet: {ndarray} training samples, labels in the last column
    :return: 0 or 1, the majority class
    """

    num_classes = np.shape(np.unique(dataSet[:, -1]))[0]
    if num_classes == 1:
        return int(dataSet[0, -1])
    elif np.count_nonzero(dataSet[:, -1]) > np.shape(dataSet)[0] / 2:
        return 1
    else:
        return 0
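
A small sanity check of the majority vote (toy arrays; only the last column, the label, matters here):

print(leaf(np.array([[0.1, 1], [0.2, 1], [0.3, 1]])))  # pure node -> 1
print(leaf(np.array([[0.1, 1], [0.2, 0], [0.3, 0]])))  # majority class 0 -> 0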


def Gini(dataSet):
    """Binary Gini index; the last column of dataSet holds the 0/1 labels."""
    m = np.shape(dataSet)[0]
    num_nonzero = np.count_nonzero(dataSet[:, -1])
    p = num_nonzero / m
    return 2 * p * (1 - p)


def split_tree(dataSet, feature, split_val):
    """Split dataSet into left and right subsets on feature at split_val.
    :param dataSet: {ndarray} training samples
    :param feature: {int} index of the feature to split on
    :param split_val: {float} the value to split at
    :return: (set_L, set_R): {tuple} left and right subsets
    """

    set_L = dataSet[np.nonzero(dataSet[:, feature] <= split_val)[0], :]
    set_R = dataSet[np.nonzero(dataSet[:, feature] > split_val)[0], :]
    return set_L, set_R
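
For example, splitting a toy dataset on feature 0 at the value 2.0:

data = np.array([[1.0, 0], [2.0, 0], [3.0, 1], [4.0, 1]])
set_L, set_R = split_tree(data, feature=0, split_val=2.0)
print(set_L)  # rows with feature 0 <= 2.0: [[1. 0.] [2. 0.]]
print(set_R)  # rows with feature 0 >  2.0: [[3. 1.] [4. 1.]]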


class CART_classification(object):
    """
    CART binary classification tree.
    """

    def __init__(self, X, Y, min_sample, min_err, max_height=20):
        """
        :param X: features of the training samples
        :param Y: labels of the training samples
        :param min_sample: minimum number of samples required to split a node
        :param min_err: minimum Gini index required to split a node
        :param max_height: maximum depth of the tree
        """
        self.X = X
        self.Y = Y
        self.min_sample = min_sample
        self.min_err = min_err
        self.max_height = max_height

    def fit(self):
        """
        Build the tree recursively.
        :return: the root Node of the decision tree
        """
        # Combine features and labels into complete samples
        data = np.c_[self.X, self.Y]
        # Initialize the best error with the Gini index of the whole node
        best_err = Gini(data)
        # Best split: (feature index, split value)
        bestCriteria = None
        # The two subsets produced by the best split
        bestSets = None
        '''
        A leaf node is created under any of these 3 conditions:
        1. the dataset has fewer samples than the given minimum
        2. the tree has reached the given maximum depth
        3. the Gini index of the dataset is below the given minimum error
        '''
        if np.shape(data)[0] <= self.min_sample or self.max_height == 1 or best_err <= self.min_err:
            return Node(results=leaf(data))

        # Search for the best split over all features and candidate values
        num_feature = np.shape(data)[1] - 1
        for feat in range(num_feature):
            val_feat = np.unique(data[:, feat])
            for val in val_feat:
                # Try this split
                set_L, set_R = split_tree(data, feat, val)
                if np.shape(set_L)[0] < 2 or np.shape(set_R)[0] < 2:
                    continue
                # Weighted Gini index after the split
                p = np.shape(set_L)[0] / np.shape(data)[0]
                err_now = p * Gini(set_L) + (1 - p) * Gini(set_R)
                # Keep the split if it improves on the best so far
                if err_now < best_err:
                    best_err = err_now
                    bestCriteria = (feat, val)
                    bestSets = (set_L, set_R)
        # If no split improves the Gini index, make this node a leaf
        if bestSets is None:
            return Node(results=leaf(data))
        # Recursively build the left and right subtrees
        left = CART_classification(bestSets[0][:, :-1], bestSets[0][:, -1], self.min_sample, self.min_err, self.max_height - 1).fit()
        right = CART_classification(bestSets[1][:, :-1], bestSets[1][:, -1], self.min_sample, self.min_err, self.max_height - 1).fit()
        return Node(feature=bestCriteria[0], split_val=bestCriteria[1], left=left, right=right)



def predict(sample, tree):
    """Predict the label of a single sample.
    :param sample: {ndarray} one sample's feature vector
    :param tree: the trained CART classification tree
    :return: {int} the predicted label
    """

    # Leaf node: return its stored value
    if tree.results is not None:
        return tree.results
    else:
        # Internal node: follow the branch chosen by the split
        val_sample = sample[tree.feature]
        if val_sample > tree.split_val:
            branch = tree.right
        else:
            branch = tree.left
        return predict(sample, branch)


def test(X, tree):
    """Run the trained CART classification tree on a test set.
    :param X: {ndarray} test samples
    :param tree: the trained CART classification tree
    :return: {list} predicted labels, one per sample
    """

    m = np.shape(X)[0]
    y_hat = []
    for i in range(m):
        pre = predict(X[i], tree)
        y_hat.append(pre)
    return y_hat


def numLeaf(tree):
    """Count the leaf nodes of the tree."""
    if tree.results is not None:
        return 1
    else:
        return numLeaf(tree.left) + numLeaf(tree.right)


def heightTree(tree):
    """Compute the height of the tree (a single leaf counts as height 1)."""
    if tree.results is not None:
        return 1
    else:
        heightL = heightTree(tree.left)
        heightR = heightTree(tree.right)
        return max(heightL, heightR) + 1


def showTree(tree):
    """Print the tree in pre-order, one dict per node."""
    node = {}
    if tree.results is None:
        node['feat'] = tree.feature
        node['splitVal'] = tree.split_val
        print(node)
        showTree(tree.left)
        showTree(tree.right)
    else:
        node['value'] = tree.results
        print(node)


if __name__ == '__main__':
    # names of the features
    featName = ['Number', 'Plasma', 'Diastolic', 'Triceps', '2-Hour', 'Body', 'Diabetes', 'Age', 'Class']
    path = "D:\\YSA\\dataFile\\xg.csv"
    # read the data file
    data = pd.read_csv(path, sep=',', header=0, names=featName)
    # set the random seed
    np.random.seed(123)
    # split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :-1].values, data.iloc[:, -1].values,
                                                        test_size=0.2, random_state=123)
    # optional normalization; if used, fit one scaler on the train set and apply it to both sets
    # scaler = MinMaxScaler().fit(X_train)
    # X_train = scaler.transform(X_train)
    # X_test = scaler.transform(X_test)
    cartTree = CART_classification(X_train, y_train, 10, 0.2, 8).fit()
    print(numLeaf(cartTree))
    print(heightTree(cartTree))
    showTree(cartTree)
    y_hat = test(X_test, cartTree)
    print(accuracy_score(y_test, y_hat))
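
Since sklearn is already imported, it is natural to sanity-check the hand-written tree against sklearn's built-in CART classifier. A minimal sketch, assuming the same X_train/X_test split as above (the hyperparameters only roughly mirror min_sample and max_height):

from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(criterion='gini', max_depth=8,
                             min_samples_split=10, random_state=123)
clf.fit(X_train, y_train)
print(accuracy_score(y_test, clf.predict(X_test)))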

4. Dataset

Baidu Cloud link:
Link: https://pan.baidu.com/s/1ae7X0xLj83zWYrgh7UQZtw
Extraction code: wno3
