ID3 Decision Tree: PlayTennis

I've been studying decision trees recently and came across a senior engineer's lecture video on Bilibili. It was very detailed, so I implemented an ID3-based decision tree by hand myself.

Full disclosure: I'm a beginner, so this is written almost entirely in plain Python, using only the standard-library math module plus numpy for a single element-wise multiply, and it doesn't draw a diagram of the final tree.

It was just to help me understand decision trees a little better; once I've learned to use external libraries properly, I'll revisit this.
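
For reference, ID3 chooses each split by information gain. The entropy of a label distribution is H(S) = -Σ p_i * log2(p_i), and the gain of splitting on attribute A is Gain(S, A) = H(S) - Σ_v (|S_v| / |S|) * H(S_v). The 14-row PlayTennis training set below (the classic example from Quinlan's ID3 paper and Mitchell's Machine Learning textbook) has 9 Yes and 5 No labels, so its root entropy is -(9/14)*log2(9/14) - (5/14)*log2(5/14) ≈ 0.940.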

#!/usr/bin/env python 
# -*- coding:utf-8 -*-
import math
import numpy as np

# Training set used to build the decision tree
data = [
    ['Sunny', 'Hot', 'High', 'Weak', 'No'],
    ['Sunny', 'Hot', 'High', 'Strong', 'No'],
    ['Overcast', 'Hot', 'High', 'Weak', 'Yes'],
    ['Rain', 'Mild', 'High', 'Weak', 'Yes'],
    ['Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
    ['Rain', 'Cool', 'Normal', 'Strong', 'No'],
    ['Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
    ['Sunny', 'Mild', 'High', 'Weak', 'No'],
    ['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
    ['Rain', 'Mild', 'Normal', 'Weak', 'Yes'],
    ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes'],
    ['Overcast', 'Mild', 'High', 'Strong', 'Yes'],
    ['Overcast', 'Hot', 'Normal', 'Weak', 'Yes'],
    ['Rain', 'Mild', 'High', 'Strong', 'No']
]
columns = ['Outlook', 'Temperature', 'Humidity', 'Wind']
columns_index = {
    'Outlook': 0,
    'Temperature': 1,
    'Humidity': 2,
    'Wind': 3,
}


# Step 1: compute the entropy of the decision (target) attribute
# over the rows that match every constraint in path
def calculate_entropy(path):
    # print('path', path)
    decision_entropy = 0
    decision_calculate = {}
    filtered_data = []
    for line in data:
        # keep only rows that satisfy every attribute=value constraint in path
        satisfy = True
        for column in path:
            if path[column] != line[columns_index[column]]:
                satisfy = False
                break
        if satisfy:
            filtered_data.append(line)
    # print(filtered_data)
    for line in filtered_data:
        count = decision_calculate.get(line[-1])
        if count is None:
            count = 0
        count += 1
        decision_calculate[line[-1]] = count
    # print(decision_calculate)
    if len(filtered_data) > 0:
        for decision in decision_calculate:
            decision_calculate[decision] /= len(filtered_data) * 1.0
            decision_entropy -= decision_calculate[decision] * math.log(decision_calculate[decision], 2)

    return decision_entropy, filtered_data
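
# Illustrative sanity check (not called anywhere by the code): with an empty
# path the whole 14-row training set is kept (9 'Yes', 5 'No'), so
# calculate_entropy({}) should return all 14 rows and an entropy of about
#   -(9/14)*log2(9/14) - (5/14)*log2(5/14) ≈ 0.940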


# Step 2: compute the conditional entropy of each candidate attribute.
# There are four candidate attributes:
# Outlook, Temperature, Humidity, Wind.
# Compute each attribute's information gain and keep the best one.
# For example, Outlook splits the data into three groups:
# Sunny (D1), Overcast (D2), Rain (D3).
def child_node(parent_score, nodes, node_data, path):
    # group node_data by each candidate attribute's values, counting class labels per value
    node_dict = {}
    for line in node_data:
        for node in nodes:
            attributes = node_dict.get(node)
            if attributes is None:
                attributes = {}
            attribute = attributes.get(line[columns_index[node]])
            if attribute is None:
                attribute = {}
            num = attribute.get(line[-1])
            if num is None:
                num = 1
            else:
                num += 1
            attribute[line[-1]] = num
            attribute['count'] = 1 if attribute.get('count') is None else attribute.get('count') + 1
            attributes[line[columns_index[node]]] = attribute
            node_dict[node] = attributes
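
    # At this point node_dict maps attribute -> value -> label counts; for the
    # full dataset it looks like (illustrative excerpt):
    # {'Outlook': {'Sunny': {'No': 3, 'Yes': 2, 'count': 5}, 'Overcast': {...}, ...}, ...}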

    # print(node_dict)
    # compute the weighted conditional entropy of each candidate attribute
    root = next(iter(node_dict))  # fall back to the first attribute if no split improves
    increment = 0

    for node in node_dict:
        node_score = []
        weight = []
        # print('node :', node, end='')
        for attribute in node_dict[node]:
            # print(' attribute :', attribute, end='')
            current_path = path.copy()
            current_path[node] = attribute
            decision_entropy, filtered_data = calculate_entropy(current_path)
            # print(' attribute_score :', decision_entropy)
            node_score.append(decision_entropy)
            weight.append((node_dict[node][attribute]['count'] / len(node_data)))
        # print('node_score', node_score)
        # print('weight', weight)
        node_score = sum(np.multiply(node_score, weight))
        # print('node_score', node_score)
        if parent_score - node_score > increment:
            increment = parent_score - node_score
            root = node
        # print('increment', parent_score - node_score)

    # print('choose :', root, ' increment:', increment)
    return root, node_dict[root]
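
# For this classic dataset the textbook information gains at the root are roughly
# Gain(Outlook) = 0.247, Gain(Humidity) = 0.151, Gain(Wind) = 0.048, and
# Gain(Temperature) = 0.029, so child_node should pick Outlook as the first split.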


# Recursively grow the tree: for each value of the chosen attribute root,
# attach a leaf if the subset is pure, otherwise split again on the best attribute.
def find_attribute(root, attributes, path, tree_node):
    # print('choose', root, 'attributes', attributes)
    # print('tree_node', tree_node.name, 'root', root)
    for attribute in attributes:
        # print('attribute... : ', attribute)
        path[root] = attribute
        # find node
        entropy, node_data = calculate_entropy(path)
        # print('filtered_data', node_data)
        # print('entropy', entropy)

        attribute_node = Node(attribute, tree_node, [])
        tree_node.next.append(attribute_node)

        if entropy == 0.0:
            # pure subset: attach a leaf carrying the class label
            attribute_node.next.append(Node(node_data[-1][-1], attribute_node, None))

        elif len(node_data) > 0:
            # impure subset: pick the next attribute to split on
            child_root, child_attributes = child_node(entropy, columns, node_data, path)

            temp_node = Node(child_root, attribute_node, [])
            attribute_node.next.append(temp_node)
            find_attribute(child_root, child_attributes, path, temp_node)

        path.pop(root)


class Node:
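    # name: an attribute name, an attribute value, or a class label (for leaves)
    # before: the parent node; next: list of child nodes, or None for a leaf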
    def __init__(self, name, before, next):
        self.next = next
        self.before = before
        self.name = name


# After the decision tree is built, walk it to classify a test instance
def predict(root, line):
    if root.next is None:  # leaf: name holds the class label
        print('result', root.name)
        return
    attribute = line.get(root.name)
    if attribute is not None:
        for child in root.next:
            if child.name == attribute:
                for grandchild in child.next:
                    predict(grandchild, line)


if __name__ == '__main__':
    # path maps already-chosen attributes to their values,
    # e.g. {'Outlook': 'Sunny', 'Temperature': 'Hot'}; empty at the root
    path = {}
    # pick the root attribute to split on
    entropy, node_data = calculate_entropy(path)
    root, attributes = child_node(entropy, columns, node_data, path)
    head = Node(root, None, [])
    # recursively grow the rest of the tree
    find_attribute(root, attributes, {}, head)

    test_data = [
        {
            'Temperature': 'Hot',
            'Humidity': 'High',
            'Wind': 'Weak',
            'Outlook': 'Sunny'
        },
        {
            'Outlook': 'Overcast',
            'Temperature': 'Hot',
            'Humidity': 'High',
            'Wind': 'Weak'
        },
    ]
    for line in test_data:
        print(line)
        predict(head, line)
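
If the tree is built correctly, this should print 'result No' for the first instance and 'result Yes' for the second: in the classic PlayTennis tree, Outlook becomes the root, the Sunny branch splits on Humidity (High → No, Normal → Yes), and Overcast is always Yes.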
