决策树

from math import log

# Human-readable names of the feature columns, in column order: tree() looks
# up FEATURE[i] for the chosen column index i and uses the name as a dict key.
# (Roughly: "can survive without surfacing", "has flippers".)
FEATURE = ["不浮出水面是否可以生存", "是否有脚蹼"]


def calculate_entropy(dataset):
    """Return the base-2 Shannon entropy of the labels in ``dataset``.

    The last element of every row is taken as that row's label. An empty
    dataset has no labels to sum over and yields ``0``.
    """
    total = len(dataset)

    # Tally how many rows carry each label (insertion order is preserved).
    counts = {}
    for row in dataset:
        counts[row[-1]] = counts.get(row[-1], 0) + 1

    entropy = 0
    for count in counts.values():
        p = count / total
        entropy -= p * log(p, 2)
    return entropy


def split_dataset(dataset, index, value):
    """Return the rows of ``dataset`` whose column ``index`` equals ``value``.

    The result is a new list; the row objects themselves are shared with
    ``dataset`` and their original order is kept.
    """
    return [row for row in dataset if row[index] == value]


def determine_feature(dataset):
    """Choose the feature column with the largest information gain.

    Every row's last element is its label; all earlier columns are
    candidate features. For each candidate the dataset is partitioned by
    the column's distinct values and the size-weighted entropy of the
    partitions is subtracted from the entropy of the whole dataset.

    Returns the index of the column with the highest gain, or ``None``
    when no column achieves a strictly positive gain.
    """
    total_rows = len(dataset)
    candidate_count = len(dataset[0]) - 1
    entropy_before = calculate_entropy(dataset)

    winner = None
    top_gain = 0
    for column in range(candidate_count):
        # Weighted entropy of the partitions induced by this column.
        entropy_after = 0
        for attr in get_dataset_feature_attr(dataset, column):
            subset = split_dataset(dataset, column, attr)
            weight = len(subset) / total_rows
            entropy_after += weight * calculate_entropy(subset)

        gain = entropy_before - entropy_after
        if gain > top_gain:
            winner, top_gain = column, gain
    return winner


def get_dataset_labels(dataset):
    """Return the set of distinct labels (last element of each row)."""
    return {row[-1] for row in dataset}


def get_dataset_feature_attr(dataset, f):
    """Return the set of distinct values found in column ``f`` of ``dataset``."""
    return {row[f] for row in dataset}


def del_dataset_feature(dataset, f):
    """Return a copy of ``dataset`` with column ``f`` removed from every row.

    Args:
        dataset: list of rows, each row an indexable sequence of cells.
        f: zero-based index of the column to drop.

    Returns:
        A new list of new row lists; ``dataset`` and its rows are not
        modified.
    """
    # Filter by position, not by cell value: the column to drop is
    # identified by its index, and cell values are not valid indices.
    return [
        [cell for position, cell in enumerate(row) if position != f]
        for row in dataset
    ]


def tree(dataset):
    """Recursively build a decision tree from ``dataset``.

    Each row is a list whose last element is the label and whose earlier
    elements are feature values; column ``i`` is named ``FEATURE[i]``.

    Returns:
        A leaf or an internal node. A leaf is the single label shared by
        all rows, or the string ``"maybe"`` when the rows cannot be
        separated any further. An internal node is a dict
        ``{feature_name: {feature_value: subtree, ...}}``.
    """
    dataset_labels = get_dataset_labels(dataset)
    if len(dataset_labels) == 1:
        # Pure node: every row agrees on the label.
        return next(iter(dataset_labels))
    if len(dataset[0]) == 1:
        # Rows carry only a label, so there is nothing to split on.
        return "maybe"

    f = determine_feature(dataset)
    if f is None:
        # No feature yields a positive information gain (e.g. rows with
        # identical feature values but different labels). Indexing
        # FEATURE with None would raise TypeError, so emit the same
        # undecided leaf used when there are no features at all.
        return "maybe"

    feature = FEATURE[f]
    cur_tree = {feature: {}}

    # Columns are never removed from the subsets, so column indices stay
    # aligned with FEATURE at every recursion depth.
    for fa in get_dataset_feature_attr(dataset, f):
        cur_tree[feature][fa] = tree(split_dataset(dataset, f, fa))

    return cur_tree


# Sample data: each row is [feature 0, feature 1, label], with the feature
# columns named by FEATURE.
random_dataset = [
    ["是", "是", "是"],
    ["是", "是", "是"],
    ["是", "否", "否"],
    ["否", "是", "否"],
    ["否", "是", "否"],
]

# Build the tree for the sample data and show the nested-dict result.
decision_tree = tree(random_dataset)
print(decision_tree)

这里写图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值