使用cart算法进行分类

勿笑葱

已于 2024-02-21 21:02:23 修改

阅读量386

点赞数 7

分类专栏：机器学习　文章标签：算法、分类、数据挖掘

于 2024-01-03 11:07:17 首次发布

本文链接：https://blog.csdn.net/seabyeolbe/article/details/135358641

版权

机器学习专栏收录该内容

9 篇文章 0 订阅

订阅专栏

本文分类的对象是鸢尾花（Iris）数据集中的第0类、第1类和第2类。这里使用的是CART算法，通过计算基尼（Gini）系数来选择划分点并进行分类。每一类各取十条数据作为测试集，其余数据全部作为训练集。测试出来的效果还不错，所以并没有进行剪枝操作。至于理论部分，网上有许多优秀的博客可以参考学习。

import numpy as np
import pandas as pd
from typing import List, Self

class Node:
    def __init__(self,
                 feature_index: int | None = None,
                 threshold: float | None = None,
                 left: Self | None = None,
                 right: Self | None = None,
                 value: int | None = None
    ):
        self.feature_index = feature_index  # 用于分割数据的特征索引
        self.threshold = threshold  # 分割阈值
        self.left = left  # 左子树
        self.right = right  # 右子树
        self.value = value  # 叶子节点的预测值

def _gini_from_counts(counts: np.ndarray) -> float:
    """Return the Gini impurity ``1 - sum(p_k ** 2)`` for per-class counts."""
    total = np.sum(counts)
    if total == 0:
        # An empty side carries zero weight in the caller; mirror the
        # convention of treating its class proportions as 0.
        return 1.0
    proportions = counts / total
    return 1 - np.sum(proportions ** 2)


def get_gini(x: np.ndarray, y: np.ndarray) -> float:
    """Find the threshold on one feature that minimises weighted Gini impurity.

    Note that, despite its name, this function returns the best *threshold*,
    not the Gini value itself.

    Candidate thresholds are the midpoints between consecutive sorted feature
    values. For each candidate, samples with ``x < threshold`` form the left
    group and the rest form the right group; the score is the size-weighted
    sum of the two groups' Gini impurities.

    Args:
        x: 1-D array with one feature value per sample.
        y: 1-D array of class labels aligned with ``x``.

    Returns:
        The candidate threshold with the lowest weighted Gini impurity. When
        several candidates tie, the smallest one is returned.

    Raises:
        ValueError: If fewer than two samples are given, since no midpoint
            threshold can be formed.
    """
    x = np.asarray(x)
    y = np.asarray(y)
    rows = len(x)
    if rows < 2:
        raise ValueError("get_gini needs at least two samples to place a threshold")

    # Encode every label once as an index into the sorted unique labels,
    # instead of searching the label array again for each sample and threshold.
    unique_labels, codes = np.unique(y, return_inverse=True)
    num_classes = len(unique_labels)

    # Midpoints of neighbouring sorted values. Repeated midpoints would give
    # identical scores, so each distinct value is evaluated once (ascending).
    sorted_x = np.sort(x)
    thresholds = np.unique((sorted_x[:-1] + sorted_x[1:]) / 2)

    best_threshold = thresholds[0]
    best_gini = float('inf')
    for threshold in thresholds:
        goes_left = x < threshold
        count_left = np.bincount(codes[goes_left], minlength=num_classes)
        count_right = np.bincount(codes[~goes_left], minlength=num_classes)

        gini_left = _gini_from_counts(count_left)
        gini_right = _gini_from_counts(count_right)
        gini = (np.sum(count_left) / rows) * gini_left + (np.sum(count_right) / rows) * gini_right

        # Strict comparison keeps the first (smallest) threshold on ties.
        if gini < best_gini:
            best_gini = gini
            best_threshold = threshold
    return best_threshold

def _split_gini(feature: np.ndarray, y: np.ndarray, threshold: float) -> float:
    """Return the size-weighted Gini impurity of splitting by ``feature < threshold``."""
    goes_left = feature < threshold
    total = len(y)
    score = 0.0
    for side in (y[goes_left], y[~goes_left]):
        if len(side) == 0:
            continue  # an empty side contributes no weight
        _, counts = np.unique(side, return_counts=True)
        proportions = counts / len(side)
        score += (len(side) / total) * (1.0 - np.sum(proportions ** 2))
    return score


def build_tree(node: "Node", x: np.ndarray, y: np.ndarray, depth: int, max_depth: int):
    """Recursively grow a classification tree in place, starting at ``node``.

    For every feature column the best threshold is obtained from
    ``get_gini`` (which returns a threshold, not an impurity), the weighted
    Gini impurity of that split is computed, and the feature with the lowest
    impurity is chosen. Samples with ``feature < threshold`` go to the left
    child, the rest to the right child.

    Args:
        node: Node to fill in; it becomes either a leaf (``value`` set) or an
            internal node (``feature_index``, ``threshold`` and both children
            set).
        x: 2-D array of samples (rows) by features (columns).
        y: 1-D array of non-negative integer class labels aligned with the
            rows of ``x`` (required by ``np.bincount``).
        depth: Depth of ``node``; the root is built with 0.
        max_depth: Growth stops once ``depth >= max_depth - 1``.
    """
    # Stop when the depth limit is reached, the node is pure, or there is
    # nothing to split on. The leaf predicts the majority class.
    if depth >= max_depth - 1 or len(np.unique(y)) == 1 or x.shape[1] == 0:
        node.value = np.bincount(y).argmax()
        return

    best_feature = 0
    best_threshold = 0
    best_gini = float('inf')
    for i in range(x.shape[1]):
        threshold = get_gini(x[:, i], y)
        # Rank features by the impurity of their best split, not by the
        # numeric value of the threshold itself.
        gini = _split_gini(x[:, i], y, threshold)
        if gini < best_gini:
            best_gini = gini
            best_feature = i
            best_threshold = threshold

    left_indices = x[:, best_feature] < best_threshold
    right_indices = ~left_indices

    # A split that sends every sample to one side separates nothing (for
    # example when all feature values are identical). Make this a leaf so
    # that no internal node is left with a missing child.
    if not left_indices.any() or not right_indices.any():
        node.value = np.bincount(y).argmax()
        return

    node.feature_index = best_feature
    node.threshold = best_threshold

    node.left = Node()
    build_tree(node.left, x[left_indices, :], y[left_indices], depth+1, max_depth)
    node.right = Node()
    build_tree(node.right, x[right_indices, :], y[right_indices], depth+1, max_depth)

def predict_tree(node: "Node", x: np.ndarray) -> int:
    """Predict the class label of one sample by walking the tree from ``node``.

    At each internal node the sample goes left when
    ``x[node.feature_index] < node.threshold`` and right otherwise, until a
    node with a non-``None`` ``value`` is reached.

    Args:
        node: Root of the (sub)tree to evaluate.
        x: 1-D feature vector of a single sample.

    Returns:
        The ``value`` stored on the leaf that the sample reaches.

    Raises:
        ValueError: If a node has neither a prediction nor any child, i.e.
            the tree was never built below it.
    """
    while node.value is None:
        if node.left is None and node.right is None:
            raise ValueError("decision tree node has neither a value nor children")

        if x[node.feature_index] < node.threshold:
            preferred, fallback = node.left, node.right
        else:
            preferred, fallback = node.right, node.left

        # A node may have only one child when its split left the other side
        # empty during training; follow the existing branch instead of
        # dereferencing None.
        node = preferred if preferred is not None else fallback
    return node.value


# Train: load the training sheet and grow the decision tree from it.
data = pd.read_excel("C:/Users/wxc/PycharmProjects/pythonProject4/机器学习/回归/决策树/train.xlsx")
x_train = np.array(data.iloc[:, 1:5])  # feature columns 1..4
y_train = np.array(data.iloc[:, 6])  # label column 6

tree_root = Node()
build_tree(tree_root, x_train, y_train, 0, 4)  # recursive build, max_depth = 4

# Test: load the test sheet (read without a header row) and classify each row.
data1 = pd.read_excel("C:/Users/wxc/PycharmProjects/pythonProject4/机器学习/回归/决策树/test.xlsx", header=None)
x_test = np.array(data1.iloc[:, 1:5])
y_test = np.array(data1.iloc[:, 6])
prediction = [predict_tree(tree_root, x) for x in x_test]
print("预测值为:", prediction)

# Accuracy: number of matching predictions divided by the number of test rows.
c = sum(1 for actual, predicted in zip(y_test, prediction) if actual == predicted)
print('准确率')
print(c/(len(y_test)))