Numpy 实现C4.5决策树

C4.5 信息增益比实现决策树

信息增益比

$$g_{R}(D, A)=\frac{g(D, A)}{H(D)}$$

其中,$g(D,A)$ 是信息增益,$H(D)$ 是数据集 $D$ 的熵。

代码实现(注:下面的实现实际以信息增益作为划分标准,并未除以熵得到信息增益比,严格来说是 ID3 的划分准则而非 C4.5)

import numpy as np

def calculate_entropy(labels):
    """Return the Shannon entropy (base 2) of the label distribution in ``labels``."""
    # Frequency of each distinct label value.
    _, freq = np.unique(labels, return_counts=True)
    p = freq / len(labels)
    # H = -sum(p_i * log2(p_i)); np.unique guarantees every p_i > 0,
    # so log2 never sees a zero.
    return -np.sum(p * np.log2(p))

def calculate_information_gain(data, labels, feature_index, threshold):
    """Information gain of splitting on ``feature_index`` at ``threshold``.

    Samples with feature value <= threshold go to the left child, samples
    with feature value > threshold go to the right child.
    """
    column = data[:, feature_index]
    labels_left = labels[column <= threshold]
    labels_right = labels[column > threshold]

    # Weighted average entropy of the two children.
    n = len(labels)
    child_entropy = (
        len(labels_left) / n * calculate_entropy(labels_left)
        + len(labels_right) / n * calculate_entropy(labels_right)
    )

    # Gain = parent entropy minus the weighted child entropy.
    return calculate_entropy(labels) - child_entropy

def find_best_split(data, labels):
    """Exhaustively search every (feature, threshold) candidate split.

    Thresholds are the unique values of each feature column; a sample goes
    left when its value is <= the threshold.  Returns the pair with the
    highest information gain, or (-1, None) when no candidate split yields
    strictly positive gain.
    """
    best_feature, best_threshold = -1, None
    best_gain = 0

    for col in range(data.shape[1]):
        for candidate in np.unique(data[:, col]):
            gain = calculate_information_gain(data, labels, col, candidate)
            if gain > best_gain:
                best_gain = gain
                best_feature, best_threshold = col, candidate

    return best_feature, best_threshold

def create_decision_tree(data, labels):
    """Recursively build a binary decision tree over ``data``/``labels``.

    Internal nodes are dicts with keys ``feature_index``, ``threshold``,
    ``left`` and ``right``; leaves are dicts with a single ``label`` key.
    """
    # Base case: all samples carry the same label -> pure leaf.
    if len(np.unique(labels)) == 1:
        return {'label': labels[0]}

    # Find the (feature, threshold) pair with the highest information gain.
    best_feature_index, best_threshold = find_best_split(data, labels)

    # BUG FIX: find_best_split returns (-1, None) when no split yields
    # positive gain (e.g. identical rows with conflicting labels).  The
    # original code then evaluated `data[:, -1] <= None`, raising a
    # TypeError.  Fall back to a majority-vote leaf instead.
    if best_feature_index == -1:
        values, counts = np.unique(labels, return_counts=True)
        return {'label': values[np.argmax(counts)]}

    # Internal node recording the chosen split.
    node = {
        'feature_index': best_feature_index,
        'threshold': best_threshold,
        'left': None,
        'right': None
    }

    # Partition the samples on the chosen split.
    left_mask = data[:, best_feature_index] <= best_threshold
    right_mask = data[:, best_feature_index] > best_threshold
    left_data = data[left_mask]
    left_labels = labels[left_mask]
    right_data = data[right_mask]
    right_labels = labels[right_mask]

    # Recurse into the two partitions.  Any selected split has strictly
    # positive gain, so both partitions are non-empty and recursion
    # terminates.
    node['left'] = create_decision_tree(left_data, left_labels)
    node['right'] = create_decision_tree(right_data, right_labels)

    return node

def predict(node, sample):
    """Walk the tree from ``node`` and return the predicted label for ``sample``."""
    # Descend until we hit a leaf (a dict that carries a 'label' key).
    while 'label' not in node:
        if sample[node['feature_index']] <= node['threshold']:
            node = node['left']
        else:
            node = node['right']
    return node['label']

# 示例数据集
# Toy training set: 10 samples, 3 integer-valued features each.
data = np.array([
    [1, 2, 0],
    [1, 2, 1],
    [1, 3, 1],
    [2, 3, 1],
    [2, 3, 0],
    [2, 2, 0],
    [1, 1, 0],
    [1, 1, 1],
    [2, 1, 1],
    [1, 3, 0]
])

# Binary class label for each training sample (aligned with `data` rows).
labels = np.array([0, 1, 1, 1, 0, 0, 0, 1, 1, 1])

# Build the decision tree from the training data.
decision_tree = create_decision_tree(data, labels)

# Samples to classify.
test_data = np.array([
    [1, 2, 0],
    [2, 1, 1],
    [1, 3, 1],
    [2, 3, 0]
])

# Classify each test sample and print the result.
for sample in test_data:
    prediction = predict(decision_tree, sample)
    print(f"样本: {sample}, 预测标签: {prediction}")

在这里插入图片描述

  • 3
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

小小程序○

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值