from math import log
from collections import Counter

# Toy weather / "play" dataset. It is tiny, so it is hard-coded here
# instead of being loaded from a file.
data = [
    {'outlook': 'sunny', 'temperature': 'hot', 'humidity': 'high', 'windy': 'FALSE', 'play': 'no'},
    {'outlook': 'sunny', 'temperature': 'hot', 'humidity': 'high', 'windy': 'TRUE', 'play': 'no'},
    {'outlook': 'overcast', 'temperature': 'hot', 'humidity': 'high', 'windy': 'FALSE', 'play': 'yes'},
    {'outlook': 'rainy', 'temperature': 'mild', 'humidity': 'high', 'windy': 'FALSE', 'play': 'yes'},
    {'outlook': 'rainy', 'temperature': 'cool', 'humidity': 'normal', 'windy': 'FALSE', 'play': 'yes'},
    {'outlook': 'rainy', 'temperature': 'cool', 'humidity': 'normal', 'windy': 'TRUE', 'play': 'no'},
    {'outlook': 'overcast', 'temperature': 'cool', 'humidity': 'normal', 'windy': 'TRUE', 'play': 'yes'},
    {'outlook': 'sunny', 'temperature': 'mild', 'humidity': 'high', 'windy': 'FALSE', 'play': 'no'},
    {'outlook': 'sunny', 'temperature': 'cool', 'humidity': 'normal', 'windy': 'FALSE', 'play': 'yes'},
    {'outlook': 'rainy', 'temperature': 'mild', 'humidity': 'normal', 'windy': 'FALSE', 'play': 'yes'},
]


def calc_entropy(data):
    """Return the Shannon entropy (base 2) of the 'play' labels in *data*.

    An empty dataset has entropy 0.0 (guards the division below).
    """
    num_instances = len(data)
    if num_instances == 0:
        return 0.0
    label_counts = Counter(instance['play'] for instance in data)
    entropy = 0.0
    for count in label_counts.values():
        prob = count / num_instances
        entropy -= prob * log(prob, 2)
    return entropy


def split_data(data, attribute):
    """Partition *data* into a dict mapping each value of *attribute*
    to the list of instances that have that value.
    """
    # NOTE: the original used a local variable named `split_data`,
    # shadowing this function; renamed to `groups`.
    groups = {}
    for instance in data:
        groups.setdefault(instance[attribute], []).append(instance)
    return groups


def choose_attribute(data, attributes):
    """Return the attribute with the highest information gain on *data*,
    or None if no attribute yields a strictly positive gain.
    """
    base_entropy = calc_entropy(data)
    best_info_gain = 0.0
    best_attribute = None
    for attribute in attributes:
        groups = split_data(data, attribute)
        # Entropy after the split, weighted by subset size.
        weighted_entropy = sum(
            (len(subset) / len(data)) * calc_entropy(subset)
            for subset in groups.values()
        )
        info_gain = base_entropy - weighted_entropy
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            best_attribute = attribute
    return best_attribute


def create_decision_tree(data, attributes):
    """Recursively build an ID3 decision tree over *data*.

    Leaf nodes:     {'attribute': None, 'class': <label>}
    Internal nodes: {'attribute': <name>, 'children': {...}, 'class': <majority label>}

    Every node carries the majority 'class' of its data so that predict()
    has a sound fallback when it meets an attribute value that was never
    seen during training (the original crashed with KeyError there).
    Returns None for an empty dataset.
    """
    if len(data) == 0:
        return None
    counts = Counter(instance['play'] for instance in data)
    # most_common is stable, so ties resolve to the first-seen label,
    # matching the original strictly-greater scan.
    majority_label = counts.most_common(1)[0][0]
    # Pure node, or no attributes left, or no attribute helps -> leaf.
    best_attribute = None
    if len(counts) > 1 and attributes:
        best_attribute = choose_attribute(data, attributes)
    if best_attribute is None:
        return {'attribute': None, 'class': majority_label}
    remaining = [a for a in attributes if a != best_attribute]
    children = {
        value: create_decision_tree(subset, remaining)
        for value, subset in split_data(data, best_attribute).items()
    }
    return {'attribute': best_attribute, 'children': children, 'class': majority_label}


def predict(decision_tree, test_data):
    """Return the list of predicted 'play' labels for *test_data*.

    If an instance has an attribute value with no matching branch,
    fall back to the majority class stored at the current node
    (BUG FIX: the original indexed node['children'][None], which always
    raised KeyError because None is never a child key).
    """
    predictions = []
    for instance in test_data:
        node = decision_tree
        while node['attribute'] is not None:
            child = node['children'].get(instance[node['attribute']])
            if child is None:
                break  # unseen value: use this node's majority class
            node = child
        predictions.append(node['class'])
    return predictions


def evaluate(predictions, expected):
    """Return the fraction of *predictions* that match *expected*."""
    correct = sum(1 for p, e in zip(predictions, expected) if p == e)
    return correct / len(predictions)


def main():
    # sklearn is only needed for the train/test split demo; imported
    # lazily so the functions above are usable without it installed.
    from sklearn.model_selection import train_test_split

    attributes = ['outlook', 'temperature', 'humidity', 'windy']

    # Train and evaluate on the full dataset (resubstitution accuracy).
    decision_tree = create_decision_tree(data, attributes)
    test_data = data[:]
    predictions = predict(decision_tree, test_data)
    accuracy = evaluate(predictions, [instance['play'] for instance in test_data])
    print(decision_tree)
    print(predictions)
    print(accuracy)

    # Hold out one third of the data for testing.
    train_data, test_data1 = train_test_split(data, test_size=0.33, random_state=0)
    decision_tree1 = create_decision_tree(train_data, attributes)
    predictions1 = predict(decision_tree1, test_data1)
    # BUG FIX: labels must come from test_data1 (the held-out split);
    # the original compared against the full dataset's labels.
    accuracy1 = evaluate(predictions1, [instance['play'] for instance in test_data1])
    print(decision_tree1)
    print(predictions1)
    print(accuracy1)


if __name__ == "__main__":
    main()