# ID3 decision-tree construction from scratch on the classic "play" (weather) toy
# dataset. First trains and predicts on the full dataset; then repeats with a
# 2/3 train / 1/3 test split.

from math import log
from sklearn.model_selection import train_test_split
# The dataset is tiny, so it is inlined here directly instead of loaded from a
# file (file loading caused errors in the original environment).
# Toy "play" dataset: each instance records four weather attributes
# (outlook, temperature, humidity, windy) plus the target label 'play'
# (whether the game was played: 'yes'/'no').
data = [
    {'outlook': 'sunny', 'temperature': 'hot', 'humidity': 'high', 'windy': 'FALSE', 'play': 'no'},
    {'outlook': 'sunny', 'temperature': 'hot', 'humidity': 'high', 'windy': 'TRUE',  'play': 'no'},
    {'outlook': 'overcast', 'temperature': 'hot', 'humidity': 'high', 'windy': 'FALSE',  'play': 'yes'},
    {'outlook': 'rainy', 'temperature': 'mild', 'humidity': 'high', 'windy': 'FALSE', 'play': 'yes'},
    {'outlook': 'rainy', 'temperature': 'cool', 'humidity': 'normal', 'windy': 'FALSE',  'play': 'yes'},
    {'outlook': 'rainy', 'temperature': 'cool', 'humidity': 'normal', 'windy': 'TRUE', 'play': 'no'},
    {'outlook': 'overcast', 'temperature': 'cool', 'humidity': 'normal', 'windy': 'TRUE', 'play': 'yes'},
    {'outlook': 'sunny', 'temperature': 'mild', 'humidity': 'high', 'windy': 'FALSE',  'play': 'no'},
    {'outlook': 'sunny', 'temperature': 'cool', 'humidity': 'normal', 'windy': 'FALSE',  'play': 'yes'},
    {'outlook': 'rainy', 'temperature': 'mild', 'humidity': 'normal', 'windy': 'FALSE',  'play': 'yes'},]
# Compute the information entropy of a dataset (based on the 'play' label).
def calc_entropy(data):
    """Return the Shannon entropy (base 2) of the 'play' labels in *data*.

    data: list of instance dicts, each carrying a 'play' key.
    Returns 0.0 for an empty dataset (the original raised ZeroDivisionError).
    """
    num_instances = len(data)
    if num_instances == 0:
        # An empty set carries no information; avoid dividing by zero.
        return 0.0
    # Count occurrences of each class label.
    label_counts = {}
    for instance in data:
        label = instance['play']
        label_counts[label] = label_counts.get(label, 0) + 1
    # Entropy = -sum(p * log2(p)) over the class distribution.
    entropy = 0.0
    for count in label_counts.values():
        prob = count / num_instances
        entropy -= prob * log(prob, 2)
    return entropy
# Partition the dataset by the values of one attribute.
def split_data(data, attribute):
    """Group the instances of *data* by their value of *attribute*.

    Returns a dict mapping attribute value -> list of instances with that
    value. (The original bound a local dict named ``split_data``, shadowing
    the function itself; renamed for clarity and switched to ``setdefault``.)
    """
    partitions = {}
    for instance in data:
        partitions.setdefault(instance[attribute], []).append(instance)
    return partitions
# Pick the attribute that yields the highest information gain.
def choose_attribute(data, attributes):
    """Return the attribute with the largest information gain over *data*,
    or None when no attribute produces a strictly positive gain."""
    base_entropy = calc_entropy(data)
    total = float(len(data))
    best_attribute = None
    best_gain = 0.0
    for candidate in attributes:
        # Weighted entropy of the partition induced by this attribute.
        partitions = split_data(data, candidate)
        weighted_entropy = sum(
            (len(subset) / total) * calc_entropy(subset)
            for subset in partitions.values()
        )
        gain = base_entropy - weighted_entropy
        if gain > best_gain:
            best_gain = gain
            best_attribute = candidate
    return best_attribute
# Recursively build the ID3 decision tree.
def create_decision_tree(data, attributes):
    """Build an ID3 tree from *data* using the candidate *attributes*.

    Node formats:
      leaf:     {'attribute': None, 'class': label}
      internal: {'attribute': name, 'children': {value: subtree}}
    Returns None for an empty dataset. *attributes* is never mutated.
    """
    if len(data) == 0:
        return None
    # Count the class labels in this subset.
    counts = {}
    for instance in data:
        label = instance['play']
        counts[label] = counts.get(label, 0) + 1
    # Pure subset: all instances share one label -> leaf.
    if len(counts) == 1:
        return {'attribute': None, 'class': next(iter(counts))}
    # Select the best split attribute (None when no attributes remain).
    best_attribute = choose_attribute(data, attributes) if attributes else None
    # No attributes left, or none gives a positive information gain
    # (choose_attribute returns None in that case): majority-class leaf.
    # The original crashed with a KeyError via split_data(data, None)
    # whenever every remaining attribute had zero gain.
    if best_attribute is None:
        return {'attribute': None, 'class': max(counts, key=counts.get)}
    # Split on the chosen attribute and recurse on each partition.
    split_data_dict = split_data(data, best_attribute)
    remaining = [a for a in attributes if a != best_attribute]
    children = {
        value: create_decision_tree(subset, remaining)
        for value, subset in split_data_dict.items()
    }
    return {'attribute': best_attribute, 'children': children}
# Predict the class label for each test instance by walking the tree.
def predict(decision_tree, test_data):
    """Return a list of predicted 'play' labels, one per instance.

    decision_tree: root node as built by create_decision_tree.
    test_data: list of instance dicts containing the tree's attributes.

    For an attribute value never seen during training, falls back to the
    majority class among the leaves of the current subtree. The original
    looked up node['children'][None], a key that is never created, so any
    unseen value raised KeyError.
    """
    def _leaf_classes(node):
        # Yield the class label of every leaf reachable from *node*.
        if node['attribute'] is None:
            yield node['class']
        else:
            for child in node['children'].values():
                yield from _leaf_classes(child)

    predictions = []
    for instance in test_data:
        node = decision_tree
        while node['attribute'] is not None:
            value = instance[node['attribute']]
            if value in node['children']:
                node = node['children'][value]
            else:
                # Unseen attribute value: majority vote over subtree leaves
                # (leaves carry no sample counts, so this is a per-leaf vote).
                counts = {}
                for label in _leaf_classes(node):
                    counts[label] = counts.get(label, 0) + 1
                node = {'attribute': None, 'class': max(counts, key=counts.get)}
        predictions.append(node['class'])
    return predictions
# Evaluate predictions against the expected labels.
def evaluate(predictions, expected):
    """Return the fraction of *predictions* equal to *expected* (accuracy).

    Returns 0.0 for empty input (the original raised ZeroDivisionError).
    """
    if not predictions:
        return 0.0
    correct = sum(1 for p, e in zip(predictions, expected) if p == e)
    return correct / len(predictions)
# List all candidate split attributes.
attributes = ['outlook', 'temperature', 'humidity', 'windy']
# Experiment 1: train on the full dataset, then predict on that same data.
decision_tree = create_decision_tree(data, attributes)
test_data = data[:]
predictions = predict(decision_tree, test_data)
accuracy = evaluate(predictions, [instance['play'] for instance in test_data])
print(decision_tree)
print(predictions)
print(accuracy)
# Experiment 2: split into 2/3 training and 1/3 test data.
train_data, test_data1 = train_test_split(data, test_size=0.33, random_state=0)
decision_tree1 = create_decision_tree(train_data, attributes)
predictions1 = predict(decision_tree1, test_data1)
# BUG FIX: the expected labels must come from test_data1 (the held-out
# split), not test_data (the full dataset) as the original had — otherwise
# predictions are scored against labels of entirely different instances.
accuracy1 = evaluate(predictions1, [instance['play'] for instance in test_data1])
print(decision_tree1)
print(predictions1)
print(accuracy1)