决策树部分代码实现
因为本人代码水平较低,实操做出和西瓜书一样的结果有些高兴,因此记录一下自己写的垃圾代码~
数据集
数据集是西瓜2.0.txt,要求“好瓜”属性(即 label)一定放在最后一列。
编号,色泽,根蒂,敲声,纹理,脐部,触感,好瓜
1,青绿,蜷缩,浊响,清晰,凹陷,硬滑,是
2,乌黑,蜷缩,沉闷,清晰,凹陷,硬滑,是
3,乌黑,蜷缩,浊响,清晰,凹陷,硬滑,是
4,青绿,蜷缩,沉闷,清晰,凹陷,硬滑,是
5,浅白,蜷缩,浊响,清晰,凹陷,硬滑,是
6,青绿,稍蜷,浊响,清晰,稍凹,软粘,是
7,乌黑,稍蜷,浊响,稍糊,稍凹,软粘,是
8,乌黑,稍蜷,浊响,清晰,稍凹,硬滑,是
9,乌黑,稍蜷,沉闷,稍糊,稍凹,硬滑,否
10,青绿,硬挺,清脆,清晰,平坦,软粘,否
11,浅白,硬挺,清脆,模糊,平坦,硬滑,否
12,浅白,蜷缩,浊响,模糊,平坦,软粘,否
13,青绿,稍蜷,浊响,稍糊,凹陷,硬滑,否
14,浅白,稍蜷,沉闷,稍糊,凹陷,硬滑,否
15,乌黑,稍蜷,浊响,清晰,稍凹,软粘,否
16,浅白,蜷缩,浊响,模糊,平坦,硬滑,否
17,青绿,蜷缩,沉闷,稍糊,稍凹,硬滑,否
伪代码
西瓜书上的算法。
从别人的博客copy的图。
代码
只实现了基础的ID3决策树,剪枝、连续值和缺失值处理没写,可能也不会写了。学长说不如看两篇论文,复现代码……
import math
from enum import Enum
from typing import List
import numpy as np
import pandas as pd
# For each original attribute, the half-open range (start, end) of its
# one-hot columns in the encoded frame; rebuilt on every read_data() call.
original_column_size = []


# Precondition: the label attribute ('好瓜') must be the LAST column.
def read_data(file_path=r'数据集路径'):
    """Load the watermelon dataset and one-hot encode every column.

    :param file_path: path to the CSV file (header row required). Defaults
        to the original hard-coded placeholder for backward compatibility;
        pass a real path to use a different dataset.
    :return: fully one-hot encoded DataFrame (label columns last).

    Side effect: rebuilds the module-level ``original_column_size`` list so
    later stages can map each attribute to its one-hot column range.
    """
    df = pd.read_csv(file_path)
    df.pop('编号')  # sample id carries no predictive information
    original_column_size.clear()
    pos = 0
    for column in df.columns:
        start = pos
        pos += len(set(df[column]))  # one one-hot column per distinct value
        original_column_size.append((start, pos))
    return pd.get_dummies(df, columns=df.columns)
def get_best_attr_ID3(df: pd.DataFrame, column_size):
    """Pick the attribute with the highest information gain (ID3 criterion).

    :param df: one-hot encoded samples; the label columns occupy the range
        given by ``column_size[-1]`` and sit at the end of the frame, with
        the positive class ('是') as the very last column.
    :param column_size: list of (start, end) one-hot column ranges, one per
        still-unused attribute, with the label range as the LAST element.
    :return: (best (start, end) pair, its information gain).
    """
    D_size = len(df)
    if D_size == 0:
        # Degenerate input; callers normally filter this out beforehand.
        return column_size[0], 0
    # Ent(D): entropy of the whole set over the one-hot label columns.
    EntD = 0
    for i in range(column_size[-1][0], len(df.columns)):
        # Scalar column sum avoids the deprecated Series positional [0].
        probability = df.iloc[:, i].sum() / D_size
        if probability == 0:
            continue
        EntD -= probability * math.log2(probability)
    # Gain(D, a) = Ent(D) - sum_v |Dv|/|D| * Ent(Dv), computed per attribute.
    # Initialising with the first attribute guarantees a valid (start, end)
    # pair is returned even when every gain is zero (previously the int 0
    # leaked out and crashed the caller's list.remove()).
    best_pair, best_gain = column_size[0], 0
    for attr_id in range(len(column_size) - 1):  # last pair is the label
        gain = EntD
        for i in range(column_size[attr_id][0], column_size[attr_id][1]):
            # Dv: samples holding this attribute value; count its positives.
            cnt = len(df[df.iloc[:, i] == 1])
            if cnt == 0:
                continue
            positive = len(df[(df.iloc[:, i] == 1) & (df.iloc[:, -1] == 1)]) / cnt
            negative = 1 - positive
            if positive == 0 or negative == 0:
                continue  # pure subset: zero entropy contribution
            gain += (positive * math.log2(positive) + negative * math.log2(negative)) * cnt / D_size
        if gain > best_gain:
            best_pair = column_size[attr_id]
            best_gain = gain
    return best_pair, best_gain
# get_best_attr_ID3(read_data(), original_column_size)
# Leaf classification: the value maps directly onto "is this a good melon".
TreeNodeTypeEnum = Enum('TreeNodeTypeEnum', {'good_melon': True, 'bad_melon': False})
class DecisionTreeNode:
    """A node of the ID3 decision tree.

    Internal nodes hold one child per attribute value; the matching one-hot
    column name is stored at the same index in ``children_attributes``.
    Leaves carry the predicted class in ``leaf_node_type``.
    """

    def __init__(self, is_leaf=False, melon_type: TreeNodeTypeEnum = TreeNodeTypeEnum.good_melon):
        # Structure (only populated for internal nodes).
        self.children = []
        self.children_attributes = []
        # Classification (only meaningful when is_leaf_node is True).
        self.is_leaf_node = is_leaf
        self.leaf_node_type = melon_type
def check_df_identical(df: pd.DataFrame, active_columns_pairs):
    """Return True if all rows of ``df`` agree on every unused attribute.

    The label pair — always the LAST element of ``active_columns_pairs`` —
    is excluded from the comparison, matching the contract "identical
    except index and the class column".

    :param df: one-hot encoded samples.
    :param active_columns_pairs: still-unused column ranges, [(left, right), ...],
        with the label range as the last element.
    :return: bool

    Fixes two defects in the original: it compared a (1, k)-shaped slice
    against (k,)-shaped rows (np.array_equal was therefore always False),
    and it wrongly included the label columns in the comparison.
    """
    columns = []
    for left, right in active_columns_pairs[:-1]:  # skip the label pair
        columns.extend(range(left, right))
    if len(df) <= 1 or not columns:
        return True  # nothing (or nothing left) to disagree on
    base = df.iloc[0, columns].values  # 1-D reference row
    return all(
        np.array_equal(base, df.iloc[row, columns].values)
        for row in range(1, len(df))
    )
def _majority_leaf(num_positive, total):
    """Leaf labelled with the majority class (an exact tie yields 'bad melon')."""
    if 2 * num_positive > total:
        return DecisionTreeNode(True)
    return DecisionTreeNode(True, TreeNodeTypeEnum.bad_melon)


def generate_decision_tree(training_set: pd.DataFrame, active_columns_pairs: List = None):
    """Recursively build an ID3 decision tree (watermelon-book algorithm 4.2).

    :param training_set: one-hot encoded samples; the label columns come
        last, with the positive class ('是') as the final column.
    :param active_columns_pairs: (start, end) column ranges of attributes
        not yet used for splitting; the LAST pair must always be the label
        range. NOTE(review): the None default is never usable — callers
        must pass the ranges produced by read_data().
    :return: the root DecisionTreeNode, or None for an empty training set.
    """
    if len(training_set) == 0:
        return None
    # Case 1: every sample belongs to one class -> pure leaf.
    num_positive = training_set.iloc[:, -1].sum()
    if num_positive == 0 or num_positive == len(training_set):
        if training_set.iat[0, -1] == 1:
            return DecisionTreeNode(True)  # all good melons
        return DecisionTreeNode(True, TreeNodeTypeEnum.bad_melon)
    # Case 2: no attributes left (only the label pair remains) or all rows
    # are identical on the remaining attributes -> majority-class leaf.
    if len(active_columns_pairs) == 1 or check_df_identical(training_set, active_columns_pairs):
        return _majority_leaf(num_positive, len(training_set))
    # Case 3: split on the attribute with maximum information gain.
    used_attribute_ids, _gain = get_best_attr_ID3(training_set, active_columns_pairs)
    return_node = DecisionTreeNode()
    active_columns_pairs.remove(used_attribute_ids)
    for attr_id in range(used_attribute_ids[0], used_attribute_ids[1]):
        # Dv: the one-hot subset holding attribute value training_set.columns[attr_id].
        sub_set = training_set[training_set.iloc[:, attr_id] == 1]
        if len(sub_set) == 0:
            # Empty partition: fall back to the parent's majority class.
            son_node = _majority_leaf(num_positive, len(training_set))
        else:
            # Recurse with a copy so siblings keep their own attribute pool.
            son_node = generate_decision_tree(sub_set, active_columns_pairs.copy())
        return_node.children.append(son_node)
        return_node.children_attributes.append(training_set.columns[attr_id])
    return return_node
if __name__ == '__main__':
    # Build the tree from the dataset on disk; the result is meant to be
    # inspected interactively via root.children / root.children_attributes.
    root = generate_decision_tree(read_data(), original_column_size)
结果
最后的结果应该就是