大部分都是递归回溯的思想,ID3、C4.5 的递归思路就不赘述了,直接跳到 CART
CART的全拼是 Classification and Regression Tree
建树时以基尼系数为依据
提一嘴熵和基尼系数
条件熵
给定一个训练集D
C4.5法则
1.计算经验熵
2.遍历所有特征,计算各特征下的条件熵
此处为i类下的j类条件概率
取条件熵最小(即信息增益最大)的特征 j 作为构建下一子节点的划分特征
3.递归执行第 1、2 步,直至叶结点中所含样本不可再分
4.剪枝是根据每个节点的 alpha 来算的。剪枝的思想和实现步骤,一位叫归零的大哥已经写得很清楚,这里就不重复造轮子了:cart树怎么进行剪枝? - 知乎 (zhihu.com)
#!/usr/bin/python
# cython: language_level=3
# -*- coding:utf-8 -*-
# @filename : script.py
# @package :
# @Time : 2022/9/19 10:46
# @Author : 晴天
# @desc :
import copy
import math
import numpy as np
import pandas as pd
class DecisionTreeNode:
    """Node of an entropy-based (ID3-style) decision tree.

    Every node keeps a reference to the full training DataFrame plus the row
    indexes of the samples that fall into it; children are stored in a dict
    keyed by feature value.  Labels are read from the column named ``'y'``.

    NOTE(review): despite the surrounding prose mentioning CART/Gini, this
    implementation uses Shannon entropy / information gain throughout.
    """

    def __init__(self, df_init: pd.DataFrame, dimension: str, children: dict, dimensions: iter, indexes: iter,
                 entropy=None):
        self.df_init = df_init        # full n*p training DataFrame (n samples, p columns)
        self.dimensions = dimensions  # candidate feature columns for further splits
        self.indexes = indexes        # row indexes of the samples in this node
        self.dim = dimension          # feature the PARENT split on to create this node ('' at the root)
        self.children = children      # feature value -> child DecisionTreeNode
        self.count_nodes = 0          # scratch counter used while traversing a subtree
        self.entropy = entropy        # weighted empirical entropy of this node's labels

    def compute_entropy(self, node, indexes):
        """Return the empirical entropy of the labels 'y' over `indexes`,
        weighted by the fraction of `node`'s samples that `indexes` covers.

        BUG FIX: the original divided by `node.indexes` (a list) instead of
        `len(node.indexes)`, raising TypeError on every call.
        """
        if not len(indexes) or not len(node.indexes):
            return 0.0  # empty partitions contribute no entropy
        dict_count = {}
        for index in indexes:
            label = node.df_init.loc[index, 'y']
            dict_count[label] = dict_count.get(label, 0) + 1
        total = len(indexes)
        weight = total / len(node.indexes)
        entropy_condition = 0.0
        for count in dict_count.values():
            p = count / total
            entropy_condition -= p * np.log(p) * weight
        return entropy_condition

    def dfs_node(self, node, epsilon):
        """Recursively grow the subtree rooted at `node`.

        Splits on the feature with the largest information gain (i.e. the
        smallest conditional entropy) and stops when the best gain falls
        below `epsilon` or no candidate feature remains.

        BUG FIXES vs. original:
        * the best-split search compared against ``-math.inf`` with ``<``,
          which is never true, so no feature was ever selected;
        * it iterated ``node.dimensions`` instead of the list with the
          already-used feature removed;
        * the information-gain bookkeeping was garbled (it subtracted
          sample fractions from the entropy).
        """
        node.entropy = self.compute_entropy(node, node.indexes)
        # Candidate features: everything except the feature used to reach this node.
        dimensions = list(node.dimensions)
        if node.dim and node.dim in dimensions:
            dimensions.remove(node.dim)
        if not dimensions or not len(node.indexes):
            return
        best_cond_entropy = math.inf
        best_dim = ''
        best_partition = None
        for dim in dimensions:
            # Partition this node's samples by their value of `dim`.
            partition = {}
            for index in node.indexes:
                key_type = node.df_init.loc[index, dim]
                partition.setdefault(key_type, []).append(index)
            # Conditional entropy: children's entropies weighted by child size
            # (compute_entropy applies the |child|/|node| weight itself).
            cond_entropy = sum(self.compute_entropy(node, idxs)
                               for idxs in partition.values())
            if cond_entropy < best_cond_entropy:
                best_cond_entropy = cond_entropy
                best_dim = dim
                best_partition = partition
        # Stop when the information gain is below the threshold.
        if node.entropy - best_cond_entropy < epsilon:
            return
        # Materialize the children of the best split and recurse into them.
        for key_type, idxs in best_partition.items():
            node.children[key_type] = DecisionTreeNode(df_init=node.df_init, dimension=best_dim, children={},
                                                       dimensions=dimensions, indexes=idxs)
        for node_child in node.children.values():
            self.dfs_node(node_child, epsilon)

    # Recursively collect leaf nodes.
    def get_node_leaf(self, node, list_leaf_nodes):
        """Append the leaves of `node`'s subtree to `list_leaf_nodes`,
        incrementing `self.count_nodes` once per visited node.

        BUG FIX: the original iterated child *keys* (strings) and only
        recursed into internal children, so leaves below the root were
        never collected.
        """
        self.count_nodes += 1
        if not node.children:
            list_leaf_nodes.append(node)
            return
        for node_child in node.children.values():
            self.get_node_leaf(node_child, list_leaf_nodes)

    # Compute the pruning score alpha for one internal node.
    def compute_alpha(self, node):
        """Compute, store, and return `node`'s pruning score.

        alpha = (sum of leaf entropies - node entropy) / (1 + subtree size),
        preserving the original author's formula.  BUG FIXES: ``count_ndoes``
        typo, stale ``self.count_nodes`` accumulator never being reset, and
        the missing ``return`` (callers rely on the returned value).
        """
        self.count_nodes = 0
        list_leaf_nodes = []
        self.get_node_leaf(node, list_leaf_nodes)
        loss_leaf_nodes = 0.0
        for leaf_node in list_leaf_nodes:
            loss_leaf_nodes += leaf_node.entropy
        loss_node = self.compute_entropy(node, node.indexes)
        alpha = (loss_leaf_nodes - loss_node) / (1 + self.count_nodes)
        node.alpha = alpha
        return alpha

    # Build the mapping alpha -> list[DecisionTreeNode].
    def get_dict_alpha2node(self, root):
        """Return a dict mapping each pruning score alpha to the nodes that
        carry it.  Leaves get alpha 0.  BUG FIX: recurse over
        ``children.values()``, not the dict's keys.
        """
        dict_alpha2node = {}

        def dfs(node):
            alpha = 0 if not node.children else self.compute_alpha(node)
            dict_alpha2node.setdefault(alpha, []).append(node)
            for node_child in node.children.values():
                dfs(node_child)

        dfs(root)
        return dict_alpha2node

    # Route a validation set through an already-built tree (a deep copy).
    def verify_tree(self, node, test_init: pd.DataFrame, indexes: iter):
        """Assign validation-sample indexes to the nodes of a tree copy.

        `node` is a copy of the trained tree; `test_init` is the validation
        DataFrame; `indexes` are the validation rows reaching this node.

        BUG FIXES: the split feature lives on the *children* (each child's
        ``.dim`` records its parent's split feature), not on ``node.dim``;
        and the final recursion iterated child keys instead of child nodes.
        """
        node.indexes = indexes
        # Leaf: nothing further to route.
        if not node.children:
            return
        # All children of one node share the same .dim (the feature this
        # node was split on during training).
        split_dim = next(iter(node.children.values())).dim
        for key_child in node.children:
            node.children[key_child].indexes = []
        for index in indexes:
            key_dim = test_init.loc[index, split_dim]
            if key_dim not in node.children:
                # Feature value unseen during training: grow a leaf for it.
                node.children[key_dim] = DecisionTreeNode(test_init, split_dim, {}, node.dimensions, [])
            node.children[key_dim].indexes.append(index)
        for node_child in node.children.values():
            self.verify_tree(node_child, test_init, node_child.indexes)

    # Total entropy over the leaves of a (possibly pruned) tree copy.
    def compute_entropy_dup(self, node):
        """Return the summed entropy of the leaves of `node`'s subtree."""
        list_leaf_nodes = []
        self.count_nodes = 0
        self.get_node_leaf(node, list_leaf_nodes)
        entropy_dup = 0.0
        for node_leaf in list_leaf_nodes:
            entropy_dup += self.compute_entropy(node_leaf, node_leaf.indexes)
        return entropy_dup

    # Sweep alpha from small to large, pruning the matching nodes each time,
    # and keep the subtree with the lowest leaf entropy.
    def node_cut(self, root):
        """Return ``(best_entropy, best_subtree)`` over all alpha prunings.

        BUG FIXES: the original assigned ``root_output = root`` (the
        unpruned tree) instead of the pruned copy, and ``root_output``
        could be unbound when no candidate improved on ``math.inf``.
        """
        dict_alpha2node = self.get_dict_alpha2node(root)
        entropy_dup = math.inf
        root_output = root  # safe default if no pruning is evaluated
        for alpha in sorted(dict_alpha2node.keys()):
            root_dup = copy.deepcopy(root)
            dict_alpha2node_dup = self.get_dict_alpha2node(root_dup)
            # Pruning = turning every node scoring `alpha` into a leaf.
            for node_dup in dict_alpha2node_dup.get(alpha, []):
                node_dup.children = {}
            entropy_dup_temp = self.compute_entropy_dup(root_dup)
            if entropy_dup_temp < entropy_dup:
                entropy_dup = entropy_dup_temp
                root_output = root_dup
        return entropy_dup, root_output
def main():
    """Demo driver: build a decision tree from a DataFrame and prune it.

    NOTE(review): the DataFrame below is an empty placeholder — supply real
    training data with feature columns plus a label column named 'y'.
    """
    df_init = pd.DataFrame()
    # Split candidates are the feature columns only; the label column 'y'
    # must never be offered as a split dimension.
    dimensions = [col for col in df_init.columns if col != 'y']
    root = DecisionTreeNode(df_init, '', {}, dimensions, list(df_init.index))
    root.dfs_node(root, 0.1)  # 0.1 = minimum information gain to keep splitting
    entropy_dup, root_output = root.node_cut(root)
    return entropy_dup, root_output
相比 ID3 的信息增益,C4.5 的生成树法则改为使用信息增益比
CART
将熵改为计算根节点基尼系数
都大同小异,就不写了,代码也没经过测试,写了一点点注释,随缘看看