# 决策树对西瓜数据集进行分类 (decision-tree classification of the watermelon dataset)

import math

def create_tree(data, attributes, max_depth=float('inf'), min_samples=1, depth=0):
    """Recursively build an ID3 decision tree.

    data: list of samples; each sample is a list whose last element is the
        class label and whose other elements are attribute values.
    attributes: list of attribute indices still available for splitting.
    max_depth: pre-pruning — stop splitting past this depth.
    min_samples: pre-pruning — a branch with fewer samples becomes a
        majority-vote leaf.
    depth: current recursion depth (internal).

    Returns a nested dict {attribute_index: {value: subtree_or_label}} or a
    label string for a leaf; None for empty input data.
    """
    if len(data) == 0:
        return None

    # All samples share one label: pure leaf.
    if all_same(data):
        return data[0][-1]

    # Stop when only one attribute remains or the depth budget is spent.
    if len(attributes) == 1 or depth >= max_depth:
        return majority_class(data)

    best_attribute = choose_best_attribute(data, attributes)

    # BUG FIX: choose_best_attribute returns None when no attribute yields
    # positive information gain; without this guard the code below would
    # build {None: {}} and crash on attributes.remove(None).
    if best_attribute is None:
        return majority_class(data)

    tree = {best_attribute: {}}

    attribute_values = get_attribute_values(data, best_attribute)

    for value in attribute_values:

        sub_data = get_sub_data(data, best_attribute, value)

        if len(sub_data) < min_samples:
            # Too few samples to keep splitting: majority-vote leaf.
            tree[best_attribute][value] = majority_class(sub_data)

        else:
            # Recurse with the chosen attribute removed from the pool.
            sub_attributes = attributes[:]
            sub_attributes.remove(best_attribute)
            tree[best_attribute][value] = create_tree(
                sub_data, sub_attributes, max_depth, min_samples, depth + 1)

    return tree


def all_same(data):
    """Return True iff every sample in *data* carries the same class label
    (the last element of each sample)."""
    first_label = data[0][-1]
    return all(row[-1] == first_label for row in data)


def majority_class(data):
    """Return the most frequent class label in *data*; ties are broken by
    first-seen order (dicts preserve insertion order)."""
    tally = {}
    for row in data:
        tally[row[-1]] = tally.get(row[-1], 0) + 1
    return max(tally, key=tally.get)


def choose_best_attribute(data, attributes):
    """Return the attribute index with the highest positive information
    gain on *data*, or None when no attribute beats zero gain."""
    best_attribute, best_gain = None, 0
    for candidate in attributes:
        candidate_gain = calculate_gain(data, candidate)
        if candidate_gain > best_gain:
            best_attribute, best_gain = candidate, candidate_gain
    return best_attribute


def calculate_gain(data, attribute):
    """Information gain of splitting *data* on *attribute*: the dataset
    entropy minus the weighted entropy of each value's subset."""
    gain = calculate_entropy(data)
    total = len(data)
    for value in get_attribute_values(data, attribute):
        subset = get_sub_data(data, attribute, value)
        # Subtract each weighted term in turn (same order as before, so
        # floating-point results are bit-identical).
        gain -= (len(subset) / total) * calculate_entropy(subset)
    return gain


def calculate_entropy(data):
    """Shannon entropy (base 2) of the class labels (last column) of
    *data*."""
    counts = {}
    for row in data:
        counts[row[-1]] = counts.get(row[-1], 0) + 1

    total = len(data)
    entropy = 0
    for count in counts.values():
        p = count / total
        entropy -= p * math.log(p, 2)
    return entropy


def get_attribute_values(data, attribute):
    """Return the distinct values of column *attribute* across *data*,
    in first-appearance order."""
    # dict.fromkeys deduplicates while preserving insertion order.
    return list(dict.fromkeys(sample[attribute] for sample in data))


def get_sub_data(data, attribute, value):
    """Return the samples of *data* whose column *attribute* equals
    *value*."""
    return [row for row in data if row[attribute] == value]


def classify(tree, sample):
    """Walk *tree* following *sample*'s attribute values until a leaf
    (label string) is reached; return that label.

    Raises KeyError if the sample has an attribute value never seen at the
    corresponding node during training.
    """
    node = tree
    while not isinstance(node, str):
        # Each internal node is {attribute_index: {value: subtree}}.
        attribute = next(iter(node))
        node = node[attribute][sample[attribute]]
    return node


def accuracy(tree, data):
    """Fraction of samples in *data* whose predicted label matches the
    true label stored in the last column."""
    hits = sum(1 for sample in data if classify(tree, sample) == sample[-1])
    return hits / len(data)


# Training split of the watermelon dataset.
# Each row: six categorical attribute values followed by the class label
# (好瓜 = good melon, 坏瓜 = bad melon).
data1 = [

    ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],

    ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],

    ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],

    ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '好瓜'],

    ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', '好瓜'],

    ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜'],

    ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '坏瓜'],

    ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '坏瓜'],

    ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '坏瓜'],

    ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '坏瓜'],

    ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜']

]

# Held-out test split, same row layout as data1: six categorical attribute
# values followed by the class label (好瓜 / 坏瓜).
data2 = [

    ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'],

    ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'],

    ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', '好瓜'],

    ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜'],

    ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '坏瓜'],

    ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', '坏瓜'],

    ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', '坏瓜']

]

# Driver: train on data1, then print the tree, per-sample predictions on
# data2, and the overall test accuracy.

# Column indices usable as split attributes (column 6 is the label).
attributes = [0, 1, 2, 3, 4, 5]

# Pre-pruning hyperparameters.
max_depth = 4
min_samples = 3

tree = create_tree(data1, attributes, max_depth, min_samples)
print("决策树:", tree)

print("决策树分类结果为")
for test_sample in data2:
    print(test_sample, "——", classify(tree, test_sample))

print("决策树分类精度为", accuracy(tree, data2))

# Sample output:
# 决策树: {4: {'凹陷': {0: {'青绿': '好瓜', '乌黑': '好瓜', '浅白': '坏瓜'}}, '稍凹': {2: {'浊响': {0: {'青绿': '好瓜', '乌黑': '好瓜'}}, '沉闷': '坏瓜'}}, '平坦': '坏瓜'}}
# 决策树分类结果为
# ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '好瓜'] —— 好瓜
# ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '好瓜'] —— 坏瓜
# ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', '好瓜'] —— 好瓜
# ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', '坏瓜'] —— 坏瓜
# ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '坏瓜'] —— 坏瓜
# ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', '坏瓜'] —— 坏瓜
# ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', '坏瓜'] —— 好瓜
# 决策树分类精度为 0.7142857142857143

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值