以下是使用ID3算法挑选西瓜的Python代码示例:
```python
import math
from collections import Counter

import numpy as np
import pandas as pd
def calc_entropy(data):
    """Return the Shannon entropy (in bits) of the class labels in *data*.

    Args:
        data: Sequence of rows; the last element of each row is used as
            that row's class label.

    Returns:
        The entropy as a float. ``0.0`` when *data* is empty or when every
        row carries the same label.
    """
    num_entries = len(data)
    label_counts = Counter(row[-1] for row in data)
    entropy = 0.0
    for count in label_counts.values():
        prob = count / num_entries
        # log2 is exact for powers of two, unlike log(prob, 2), which
        # divides two rounded natural logarithms.
        entropy -= prob * math.log2(prob)
    return entropy
def split_data(data, axis, value):
    """Return the rows whose feature at *axis* equals *value*, without it.

    Args:
        data: Sequence of rows (lists or tuples).
        axis: Index of the feature column to test and to drop.
        value: Feature value a row must have at *axis* to be kept.

    Returns:
        A new list of new lists, one per matching row, with the column at
        *axis* removed. Neither *data* nor its rows are modified.
    """
    # Concatenate the slices on either side of ``axis``. Wrapping them in
    # list() keeps this working for tuple rows, which have no ``extend``.
    return [
        list(row[:axis]) + list(row[axis + 1:])
        for row in data
        if row[axis] == value
    ]
def choose_best_feature_to_split(data):
    """Return the index of the feature with the highest information gain.

    For every feature column the rows are grouped by feature value; the
    size-weighted entropy of those groups is subtracted from the entropy
    of the whole data set, and the feature with the largest difference
    wins.

    Args:
        data: Non-empty sequence of rows. The last element of each row is
            the class label, the preceding elements are hashable feature
            values.

    Returns:
        The zero-based index of the best feature, or ``-1`` when no
        feature yields a strictly positive information gain (this
        includes rows that contain only the class label).

    Raises:
        IndexError: If *data* is empty.
    """
    num_rows = len(data)
    num_features = len(data[0]) - 1
    base_entropy = calc_entropy(data)
    best_info_gain = 0.0
    best_feature = -1
    for i in range(num_features):
        # Group the rows by their value for feature i in one pass. The
        # entropy helper only looks at the last element of each row, so
        # the feature column does not have to be stripped first. A dict
        # keeps first-seen order, which makes the summation order (and
        # therefore the floating-point result) reproducible.
        groups = {}
        for row in data:
            groups.setdefault(row[i], []).append(row)
        new_entropy = 0.0
        for subset in groups.values():
            new_entropy += len(subset) / num_rows * calc_entropy(subset)
        info_gain = base_entropy - new_entropy
        # Strict '>' keeps the lowest feature index on ties.
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            best_feature = i
    return best_feature
def majority_cnt(class_list):
    """Return the most frequent label in *class_list*.

    When several labels share the highest count, the one that occurs
    first in *class_list* is returned.

    Args:
        class_list: Non-empty iterable of hashable class labels.

    Returns:
        The label with the highest number of occurrences.

    Raises:
        IndexError: If *class_list* is empty.
    """
    return Counter(class_list).most_common(1)[0][0]
def create_tree(data, labels):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Args:
        data: Non-empty list of rows. The last element of each row is the
            class label, the preceding elements are feature values.
        labels: Feature names, one per feature column of *data*, in
            column order. The sequence is not modified.

    Returns:
        Either a class label (leaf) or a dict of the form
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.

    Raises:
        IndexError: If *data* is empty.
    """
    class_list = [row[-1] for row in data]
    # Every row has the same class: this node is a leaf.
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]
    # Only the class column is left: fall back to a majority vote.
    if len(data[0]) == 1:
        return majority_cnt(class_list)
    best_feature = choose_best_feature_to_split(data)
    # -1 means no feature gives a positive information gain. Using it as
    # an index would split on the class-label column itself, so stop here
    # and return the majority class instead.
    if best_feature < 0:
        return majority_cnt(class_list)
    best_feature_label = labels[best_feature]
    # Build a new sequence without the chosen feature instead of deleting
    # in place, so the caller's ``labels`` stays intact.
    remaining_labels = labels[:best_feature] + labels[best_feature + 1:]
    branches = {}
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # branch order does not depend on set iteration order.
    for value in dict.fromkeys(row[best_feature] for row in data):
        subset = split_data(data, best_feature, value)
        branches[value] = create_tree(subset, remaining_labels)
    return {best_feature_label: branches}
# Sample watermelon data: six categorical feature columns followed by the
# class column '好瓜' ('是' / '否'). The class column must stay last, because
# the tree-building code treats the last element of each row as the label.
# NOTE(review): some values look misplaced -- '硬滑' appears once in the
# '纹理' column but is otherwise a '触感' value, and '蜷缩' appears in '脐部'
# but is otherwise a '根蒂' value. Possibly data-entry slips; confirm
# against the intended data set.
watermelon_data = pd.DataFrame({
'色泽': ['青绿', '乌黑', '乌黑', '青绿', '浅白', '青绿', '乌黑', '乌黑', '乌黑', '青绿', '浅白', '浅白', '青绿', '乌黑', '浅白', '青绿', '青绿', '青绿', '乌黑', '浅白'],
'根蒂': ['蜷缩', '蜷缩', '稍蜷', '稍蜷', '硬挺', '稍蜷', '稍蜷', '稍蜷', '稍蜷', '平坦', '平坦', '蜷缩', '稍蜷', '稍蜷', '硬挺', '稍蜷', '稍蜷', '稍蜷', '蜷缩', '蜷缩'],
'敲声': ['浊响', '沉闷', '浊响', '清脆', '清脆', '清脆', '浊响', '浊响', '沉闷', '清脆', '沉闷', '沉闷', '清脆', '浊响', '中沉闷', '清脆', '浊响', '浊响', '浊响', '中沉闷'],
'纹理': ['清晰', '清晰', '清晰', '清晰', '清晰', '稍糊', '稍糊', '稍糊', '稍糊', '清晰', '稍糊', '稍糊', '清晰', '稍糊', '硬滑', '清晰', '稍糊', '稍糊', '清晰', '稍糊'],
'脐部': ['凹陷', '凹陷', '凹陷', '凹陷', '平坦', '平坦', '平坦', '凹陷', '凹陷', '凹陷', '平坦', '蜷缩', '蜷缩', '凹陷', '平坦', '凹陷', '凹陷', '凹陷', '蜷缩', '平坦'],
'触感': ['硬滑', '硬滑', '硬滑', '软粘', '软粘', '软粘', '硬滑', '硬滑', '硬滑', '软粘', '硬滑', '硬滑', '软粘', '硬滑', '软粘', '硬滑', '硬滑', '硬滑', '软粘', '硬滑'],
'好瓜': ['是', '是', '是', '是', '是', '是', '是', '是', '否', '否', '否', '否', '否', '否', '否', '否', '否', '否', '否', '否']
})
# Training set: the first 13 rows, all columns (features and class).
train_data = watermelon_data.iloc[:13, :]
# Feature names: every column name except the last one (the class column).
train_labels = list(watermelon_data.columns)[:-1]
# Build the decision tree from the training rows (converted to a list of
# plain Python lists) and print the resulting nested dict.
my_tree = create_tree(train_data.values.tolist(), train_labels)
print(my_tree)
```
输出结果(仅为格式示意;按上面的训练数据实际运行得到的决策树与此不同——例如前13条样本中“软粘”的瓜并不全是好瓜,且不包含“中沉闷”这一取值):
```
{'触感': {'软粘': '是', '硬滑': {'敲声': {'浊响': '否', '清脆': '是', '中沉闷': '否'}}}}
```