import pandas as pd
from math import log
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
def Ent(dataset):
    """Compute the Shannon entropy (base 2) of the class labels in *dataset*.

    Args:
        dataset: An iterable of samples. Each item is either a row-like
            container (list, tuple, ...) whose last element is the class
            label, or a bare label (string, number, ...), as produced when
            iterating over a pandas Series of labels.

    Returns:
        The entropy as a float; 0.0 for an empty dataset.
    """
    label_counts = {}
    for item in dataset:
        # Strings are labels themselves: indexing them with [-1] would
        # silently collapse e.g. "cat" and "bat" into the same class "t".
        # Scalars (no __len__) cannot be indexed at all.
        if isinstance(item, (str, bytes)) or not hasattr(item, "__len__"):
            label_current = item
        else:
            label_current = item[-1]
        label_counts[label_current] = label_counts.get(label_current, 0) + 1
    # Derive the sample count from the tallies so that any iterable works.
    n = sum(label_counts.values())
    ent = 0.0
    for count in label_counts.values():
        prob = count / n
        ent -= prob * log(prob, 2)
    return ent
def sum_weight(grouped, total_len):
    """Return the entropy of one branch, weighted by the branch's share of samples.

    Args:
        grouped: DataFrame holding the samples of a single branch; its last
            column is passed to ``Ent`` as the labels.
        total_len: Number of samples in the dataset the branch was split from.

    Returns:
        ``len(grouped) / total_len`` multiplied by the entropy of the branch.
    """
    share = len(grouped) / total_len
    branch_labels = grouped.iloc[:, -1]
    return share * Ent(branch_labels)
def Gain(column, data):
    """Compute the information gain obtained by splitting *data* on *column*.

    Args:
        column: Label of the feature column to split on.
        data: DataFrame whose last column holds the class labels.

    Returns:
        Entropy of the whole label column minus the sum of the weighted
        entropies of the branches created by the split.
    """
    total = len(data)
    # Iterate over the groups explicitly instead of using groupby().apply():
    # apply() on a frame that still contains the grouping column emits a
    # deprecation warning in recent pandas releases.
    ent_sum = sum(sum_weight(group, total) for _, group in data.groupby(column))
    ent_D = Ent(data.iloc[:, -1])
    return ent_D - ent_sum
def get_max_gain(data):
    """Return the feature column with the largest information gain.

    Args:
        data: DataFrame whose last column holds the class labels; every
            other column is treated as a candidate feature.

    Returns:
        The label of the best feature column. Ties go to the column that
        appears first.

    Raises:
        ValueError: If *data* has no feature column before the label column.
    """
    cols = data.columns[:-1]
    if len(cols) == 0:
        raise ValueError(
            "data must contain at least one feature column before the label column"
        )
    # max() returns the first maximal item, so earlier columns win ties.
    # Unlike a "gain > 0" running comparison, it still yields a column when
    # every gain is zero instead of leaving the result unbound.
    return max(cols, key=lambda col: Gain(col, data))
def get_most_label(label_list):
    """Return the most frequent value of a label Series.

    Args:
        label_list: pandas Series of class labels.

    Returns:
        The label with the highest count, as reported by ``value_counts``.

    Raises:
        ValueError: If there is no label to count.
    """
    counts = label_list.value_counts()
    if counts.empty:
        raise ValueError("cannot pick a majority label from an empty label list")
    return counts.idxmax()
def TreeGenerate(data):
    """Recursively build a decision tree from *data*.

    Args:
        data: DataFrame whose last column holds the class labels and whose
            other columns are the features still available for splitting.

    Returns:
        Either a single class label (leaf) or a nested dict of the form
        ``{feature: {feature_value: subtree_or_label, ...}}``.

    Raises:
        ValueError: If *data* is empty.
    """
    if data.empty:
        raise ValueError("cannot build a decision tree from an empty dataset")
    feature = data.columns[:-1]
    label_list = data.iloc[:, -1]
    # All samples share one class: this node is a leaf of that class.
    if len(pd.unique(label_list)) == 1:
        return label_list.values[0]
    # No feature left, or all samples agree on every remaining feature:
    # make a leaf labelled with the majority class.
    if len(feature) == 0 or len(data.loc[:, feature].drop_duplicates()) == 1:
        return get_most_label(label_list)
    # Pick the best feature to split on.
    best_attr = get_max_gain(data)
    branches = {}
    # Create one branch per value of the chosen feature.
    for attr, gb_data in data.groupby(by=best_attr):
        if len(gb_data) == 0:
            # Safeguard: an empty group falls back to the parent's majority
            # class (groupby presumably yields only non-empty groups here).
            branches[attr] = get_most_label(label_list)
        else:
            # Drop the feature just used and recurse on the remaining columns.
            branches[attr] = TreeGenerate(gb_data.drop(best_attr, axis=1))
    return {best_attr: branches}
def tree_predict(tree, data):
    """Classify one sample by walking down a decision tree.

    Args:
        tree: A nested dict ``{feature: {feature_value: subtree_or_label}}``
            or a bare label when the tree consists of a single leaf.
        data: Mapping-like sample (e.g. a dict or a DataFrame row) that can
            be indexed by feature name.

    Returns:
        The label stored at the leaf reached by the sample.

    Raises:
        KeyError: If the sample has a feature value for which the tree has
            no branch.
    """
    node = tree
    # Anything that is not a dict is a leaf, whatever the label type is.
    while isinstance(node, dict):
        feature = next(iter(node))
        value = data[feature]
        branches = node[feature]
        if value not in branches:
            raise KeyError(
                f"attribute {feature!r} has no branch for value {value!r}"
            )
        node = branches[value]
    return node
if __name__ == '__main__':
    # Load the data set; the first column is skipped and the last column
    # is used as the class label.
    dataset = pd.read_csv('play_tennis.csv')
    label = dataset.iloc[:, -1]
    data = dataset.iloc[:, 1:-1]
    # Hold out 20% of the samples for testing (fixed seed).
    X_train, X_test, y_train, y_test = train_test_split(
        data,
        label,
        test_size=0.2,
        random_state=1,
    )
    # TreeGenerate expects the label as the last column of the frame.
    train = pd.concat([X_train, y_train], axis=1)
    # Train the model.
    decision_tree = TreeGenerate(train)
    # Predict each test row and report the accuracy.
    y_predict = X_test.apply(lambda row: tree_predict(decision_tree, row), axis=1)
    score = accuracy_score(y_test, y_predict)
    print(score)
# Dataset: play_tennis.csv has been uploaded to the CSDN file section.