【Deep-ML系列】Decision Tree Learning（手写决策树）

py明天会更好

于 2024-08-16 08:21:37 发布

阅读量107

点赞数

分类专栏：深度学习/机器学习　文章标签：决策树、算法、机器学习

本文链接：https://blog.csdn.net/weixin_56029873/article/details/141240337

版权

深度学习/机器学习专栏收录该内容

11 篇文章 0 订阅

订阅专栏

题目链接：Deep-ML

import math
from collections import Counter

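"""
    Decision tree learning (ID3): at every node, split on the attribute with the
    highest information gain, i.e. the attribute whose split leaves the lowest
    weighted entropy (the least uncertainty) in the target labels.
"""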

def calculate_entropy(labels):
    """Return the base-2 Shannon entropy of a collection of labels.

    :param labels: sized collection of hashable label values, one per example.
    :return: ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct label. An empty collection yields ``0``.
    """
    frequency = Counter(labels)  # distinct label -> number of occurrences
    n_labels = len(labels)
    weighted_logs = []
    for occurrences in frequency.values():
        probability = occurrences / n_labels
        weighted_logs.append(probability * math.log2(probability))
    return -sum(weighted_logs)


def calculate_information_gain(examples, attr, target_attr):
    """Return the information gain obtained by splitting ``examples`` on ``attr``.

    The gain is the entropy of the target labels over all examples minus the
    weighted average entropy of the target labels inside each group of
    examples that share the same value of ``attr``.

    :param examples: list of dicts, each mapping attribute names to values.
    :param attr: name of the attribute to evaluate as a split.
    :param target_attr: name of the attribute holding the label to predict.
    :return: entropy before the split minus the weighted entropy after it.
    """
    # Entropy of the labels before any split.
    base_entropy = calculate_entropy([row[target_attr] for row in examples])

    # Accumulate the entropy remaining after the split, one distinct value at a time.
    remaining_entropy = 0
    for attr_value in {row[attr] for row in examples}:
        # Labels of the examples that take this particular value of ``attr``.
        matching_labels = [row[target_attr] for row in examples if row[attr] == attr_value]
        weight = len(matching_labels) / len(examples)
        remaining_entropy += weight * calculate_entropy(matching_labels)

    return base_entropy - remaining_entropy


def majority_class(examples, target_attr):
    """Return the most frequent value of ``target_attr`` among ``examples``.

    :param examples: non-empty list of dicts, each containing ``target_attr``.
    :param target_attr: name of the attribute holding the label.
    :return: the label with the highest count, as ranked by
        ``Counter.most_common``.
    :raises IndexError: if ``examples`` is empty.
    """
    label_tally = Counter(row[target_attr] for row in examples)
    ranked = label_tally.most_common(1)  # [(label, count)] or [] when empty
    return ranked[0][0]


def learn_decision_tree(examples, attributes, target_attr):
    """Recursively build a decision tree by always splitting on the attribute
    with the highest information gain.

    :param examples: list of dicts, each mapping attribute names to values.
    :param attributes: sequence (list or tuple) of attribute names still
        available for splitting. It is never mutated.
    :param target_attr: name of the attribute holding the label to predict.
    :return: one of
        * the string ``'No examples'`` when ``examples`` is empty;
        * a single label when every example carries the same label, or when no
          attributes are left (then the majority label is returned);
        * a nested dict ``{best_attr: {value: subtree, ...}}`` otherwise, with
          branches listed in the order their values first appear in
          ``examples``.
    """
    if not examples:
        return 'No examples'

    # Pure node: every example carries the same label.
    first_label = examples[0][target_attr]
    if all(example[target_attr] == first_label for example in examples):
        return first_label

    # Nothing left to split on: fall back to the most common label.
    if not attributes:
        return majority_class(examples, target_attr)

    # Information gain of every candidate attribute; ties go to the attribute
    # listed first because ``max`` returns the first maximal key.
    gains = {attr: calculate_information_gain(examples, attr, target_attr) for attr in attributes}
    best_attr = max(gains, key=gains.get)

    # The attribute list for the children is the same for every branch, so it
    # is built once. Filtering (instead of copy + remove) accepts any sequence
    # and drops every occurrence of ``best_attr``.
    remaining_attributes = [attr for attr in attributes if attr != best_attr]

    # Group the examples by their value of ``best_attr`` in a single pass.
    # A dict keeps first-seen order, so the branch order is reproducible.
    partitions = {}
    for example in examples:
        partitions.setdefault(example[best_attr], []).append(example)

    return {
        best_attr: {
            value: learn_decision_tree(subset, remaining_attributes, target_attr)
            for value, subset in partitions.items()
        }
    }


if __name__ == '__main__':
    # Demo: learn a tree that predicts 'PlayTennis' from 'Outlook' and 'Wind'
    # on a small hand-written data set, then print the resulting nested dict.
    weather_records = [
        {'Outlook': 'Sunny', 'Wind': 'Weak', 'PlayTennis': 'No'},
        {'Outlook': 'Overcast', 'Wind': 'Strong', 'PlayTennis': 'Yes'},
        {'Outlook': 'Rain', 'Wind': 'Weak', 'PlayTennis': 'Yes'},
        {'Outlook': 'Sunny', 'Wind': 'Strong', 'PlayTennis': 'No'},
        {'Outlook': 'Sunny', 'Wind': 'Weak', 'PlayTennis': 'Yes'},
        {'Outlook': 'Overcast', 'Wind': 'Weak', 'PlayTennis': 'Yes'},
        {'Outlook': 'Rain', 'Wind': 'Strong', 'PlayTennis': 'No'},
        {'Outlook': 'Rain', 'Wind': 'Weak', 'PlayTennis': 'Yes'},
    ]
    feature_names = ['Outlook', 'Wind']
    demo_tree = learn_decision_tree(weather_records, feature_names, 'PlayTennis')
    print(demo_tree)