1. Overview
The decision tree is built recursively. Two classes are defined: decisionTreeNode and decisionTree.
ID3.py
├── decisionTreeNode
│   ├── __init__()
│   │   ├── fea_list
│   │   ├── dataset
│   │   ├── split_fea
│   │   ├── first_descendant
│   │   ├── brother
│   │   ├── cls
│   │   └── split_fea_value
│   ├── set_split_fea()
│   ├── set_first_descendant()
│   ├── set_brother()
│   ├── set_cls()
│   └── set_split_fea_value()
└── decisionTree
    ├── __init__()
    │   └── root
    ├── set_current_node()
    ├── get_root()
    ├── build_tree()
    └── cal_info_gains()
In decisionTreeNode, self.fea_list is the set of candidate features at the current node; self.dataset is the set of samples at the current node; self.split_fea is the splitting feature with the largest information gain when the node can be split, and None otherwise; self.first_descendant is the node's first child; self.brother is the node's next sibling; self.cls is the class label when the node is a leaf, and None otherwise; self.split_fea_value is the value of the parent's splitting feature that leads to this node.
In decisionTree, self.root is the root of the decision tree; cal_info_gains computes the information gain of a feature; build_tree builds the tree recursively.
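Note that children are stored in a first-child/next-sibling layout: a node keeps only its first child in first_descendant, and the remaining children are chained through brother pointers. A minimal sketch of recovering all children of a node (the enumerate_children helper is illustrative and not part of ID3.py):

def enumerate_children(node):
    # walk the sibling chain that starts at the node's first child
    child = node.first_descendant
    while child is not None:
        yield child
        child = child.brother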
2. Algorithm (build_tree)
Input: the root node (a decisionTreeNode object)
Output: the decision tree T
1. If the current node is None, return.
2. If all samples in the current node's dataset belong to one class, or fea_list is empty, set the current node's cls (to that class, or to the majority class, respectively).
3. Otherwise, compute the (fea, infogain) pairs with cal_info_gains and take the feature with the largest information gain over the current node's dataset. Partition the dataset by that feature's values, assign the first partition to first_descendant, and in a loop chain the remaining partitions through the brother pointers of the children; a worked information-gain example follows this list.
4. If the current node's brother is not None, recurse on the brother.
5. If the current node's first_descendant is not None, recurse on the first_descendant.
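As a sanity check for step 3: the information gain of feature A over dataset D is g(D, A) = H(D) - H(D|A), with H(D) = -Σ p·log2(p). On the loan dataset from section 3 (9 positive, 6 negative samples), H(D) = -(9/15)log2(9/15) - (6/15)log2(6/15) ≈ 0.971; splitting on house gives H(D|house) = (6/15)·0 + (9/15)·H(3/9, 6/9) ≈ 0.551, so g(D, house) ≈ 0.420, the largest of the four features, which is why house ends up at the root. A standalone numpy version of the same computation (deliberately simpler than cal_info_gains below, and independent of the classes in section 3):

import numpy as np

def entropy(labels):
    # H = -sum p * log2(p) over the class frequencies
    _, counts = np.unique(labels, return_counts=True)
    p = counts / labels.size
    return float(np.sum(-p * np.log2(p)))

def info_gain(dataset, col):
    # g(D, A) = H(D) - sum_k |D_k|/|D| * H(D_k)
    labels = dataset[:, -1]
    cond = 0.0
    for v in np.unique(dataset[:, col]):
        sub = labels[dataset[:, col] == v]
        cond += sub.size / labels.size * entropy(sub)
    return entropy(labels) - cond

For the house feature (column 2) this returns ≈ 0.420, matching cal_info_gains.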
3. Code
# _*_coding:UTF-8_*_
import math
import numpy as np
import pandas as pd
#fea_name = {0:"cs", 1:"cac", 2:"chg", 3:"c_social", 4:"aupn_u", 5:"use_ep_m", 6:"use_ep", 7:"satm"}
fea_name = {0:"age", 1:"work", 2:"house", 3:"loan"}
class decisionTreeNode:
    # cls is None for internal nodes; for a leaf it is the
    # concrete class with the largest proportion in the node
    def __init__(self, fea_list, dataset, split_fea=None, first_descendant=None, brother=None, cls=None, split_fea_value=None):
        self.fea_list = fea_list                    # candidate features at this node
        self.dataset = dataset                      # samples that reach this node
        self.split_fea = split_fea                  # best splitting feature, or None
        self.first_descendant = first_descendant    # first child
        self.brother = brother                      # next sibling
        self.cls = cls                              # class label if this is a leaf, else None
        self.split_fea_value = split_fea_value      # parent's feature value leading here
    def set_split_fea(self, split_fea):
        self.split_fea = split_fea
    def set_first_descendant(self, first_descendant):
        self.first_descendant = first_descendant
    def set_brother(self, brother):
        self.brother = brother
    def set_cls(self, cls):
        self.cls = cls
    def set_split_fea_value(self, split_fea_value):
        self.split_fea_value = split_fea_value
class decisionTree:
    def __init__(self, dataset):
        if dataset.size > 0 and dataset.ndim == 2:
            # the last column is the class label, so candidate features are 0..n-2
            self.root = decisionTreeNode(np.arange(dataset.shape[1]-1), dataset, split_fea_value='root')
        else:
            print("data format isn't supported!")
            self.root = None
    def set_current_node(self, current_node):
        self.current_node = current_node
    def get_root(self):
        return self.root
    def build_tree(self, current_node):
        if not current_node:
            return
        if np.unique(current_node.dataset[:, -1]).size == 1 or current_node.fea_list.size == 0:
            if np.unique(current_node.dataset[:, -1]).size == 1:
                # all samples share one class: leaf with that class
                current_node.set_cls(np.unique(current_node.dataset[:, -1])[0])
            else:
                # no features left: leaf labelled with the majority class
                cls_count = np.array([[x, current_node.dataset[current_node.dataset[:, -1] == x].shape[0]]
                                      for x in np.unique(current_node.dataset[:, -1])])
                current_node.set_cls(cls_count[cls_count[:, -1] == max(cls_count[:, -1])][0][0])
        else:
            # find the best splitting feature
            fea_infogain = np.array([[x, self.cal_info_gains(current_node.dataset, x)] for x in current_node.fea_list])
            # several features may tie at the maximum information gain; the first one is taken
            if max(fea_infogain[:, -1]) > 0:
                current_split_fea = int(fea_infogain[fea_infogain[:, -1] == max(fea_infogain[:, -1])][0][0])
                current_node.set_split_fea(current_split_fea)
                descendant_fea_list = current_node.fea_list[current_node.fea_list != current_split_fea]
                diff_fea_value = np.unique(current_node.dataset[:, current_split_fea])
                # the first partition becomes the first child ...
                current_node.first_descendant = decisionTreeNode(descendant_fea_list,
                    current_node.dataset[current_node.dataset[:, current_split_fea] == diff_fea_value[0]],
                    split_fea_value=diff_fea_value[0])
                # ... and the remaining partitions are chained through brother pointers
                tmp = current_node.first_descendant
                for x in diff_fea_value[1:]:
                    tmp.brother = decisionTreeNode(descendant_fea_list,
                        current_node.dataset[current_node.dataset[:, current_split_fea] == x],
                        split_fea_value=x)
                    tmp = tmp.brother
            else:
                # no feature gives positive information gain: leaf with the majority class
                cls_count = np.array([[x, current_node.dataset[current_node.dataset[:, -1] == x].shape[0]]
                                      for x in np.unique(current_node.dataset[:, -1])])
                current_node.set_cls(cls_count[cls_count[:, -1] == max(cls_count[:, -1])][0][0])
        if current_node.brother:
            self.build_tree(current_node.brother)
        if current_node.first_descendant:
            self.build_tree(current_node.first_descendant)
    def cal_info_gains(self, dataset, split_col):
        cls = dataset[:, -1]
        # distinct class labels
        unique_cls = np.unique(cls)
        # number of samples in each class, turned into class probabilities
        cls_count = np.array([cls[cls == x].size for x in unique_cls])
        cls_count = cls_count / dataset.shape[0]
        # entropy of the dataset: H(D) = -sum_i p_i * log2(p_i)
        dataset_entropy = np.sum((-1 * cls_count) * (np.log2(cls_count)))
        # conditional entropy H(D|A)
        splitcol_cls = dataset[:, [split_col, -1]]
        unique_col = np.unique(dataset[:, split_col])
        # dik_d[k][i] = |D_ki| / |D|: joint probability of feature value k and class i
        dik_d = np.array([[splitcol_cls[splitcol_cls[:, 0] == y][:, -1][splitcol_cls[splitcol_cls[:, 0] == y][:, -1] == x].size
                           for x in unique_cls] for y in unique_col])
        dik_d = dik_d / dataset.shape[0]
        # dik_dk[k][i] = |D_ki| / |D_k|: probability of class i within the subset with feature value k
        dik_dk = np.array([[splitcol_cls[splitcol_cls[:, 0] == y][:, -1][splitcol_cls[splitcol_cls[:, 0] == y][:, -1] == x].size /
                            splitcol_cls[splitcol_cls[:, 0] == y].shape[0] for x in unique_cls] for y in unique_col])
        # guard log2 against zero probabilities
        log_dik_dk = dik_dk.ravel()
        log_dik_dk = np.where(log_dik_dk > 0, np.log2(log_dik_dk), log_dik_dk).reshape((-1, dik_d.shape[1]))
        condition_entropy = np.sum(-1 * dik_d * log_dik_dk)
        info_gains = dataset_entropy - condition_entropy
        return info_gains
def recursive_visit_tree(current_node, last_dict):
    # serialize the tree into nested dicts: a leaf maps its split_fea_value to
    # (class, fraction of the node's samples carrying that class), while an
    # inner node maps it to {feature name: {children...}}
    if not current_node:
        return
    if current_node.split_fea is None:
        last_dict[current_node.split_fea_value] = (current_node.cls,
            current_node.dataset[current_node.dataset[:, -1] == current_node.cls].shape[0] / current_node.dataset.shape[0])
    else:
        last_dict[current_node.split_fea_value] = {fea_name[current_node.split_fea]: {}}
    recursive_visit_tree(current_node.brother, last_dict)
    if current_node.split_fea is not None:
        recursive_visit_tree(current_node.first_descendant,
                             last_dict[current_node.split_fea_value][fea_name[current_node.split_fea]])
if __name__ == '__main__':
    # age: 0 young, 1 middle-aged, 2 elderly
    # work: 0 no, 1 yes
    # house: 0 no, 1 yes
    # credit: 0 fair, 1 good, 2 excellent
    # loan granted (class label): 0 no, 1 yes
    dataset = np.array([[0,0,0,0,0],
                        [0,0,0,1,0],
                        [0,1,0,1,1],
                        [0,1,1,0,1],
                        [0,0,0,0,0],
                        [1,0,0,0,0],
                        [1,0,0,1,0],
                        [1,1,1,1,1],
                        [1,0,1,2,1],
                        [1,0,1,2,1],
                        [2,0,1,2,1],
                        [2,0,1,1,1],
                        [2,1,0,1,1],
                        [2,1,0,2,1],
                        [2,0,0,0,0]])
    dc_tree = decisionTree(dataset)
    dc_tree.build_tree(dc_tree.root)
    tree_dict = {}
    recursive_visit_tree(dc_tree.root, tree_dict)
    print(tree_dict)
    # createPlot(tree_dict["root"])  # optional: needs an external plotting helper; createPlot is not defined in this file
4. Result
{'root': {'house': {0: {'work': {0: (0, 1.0), 1: (1, 1.0)}}, 1: (1, 1.0)}}}
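The nested dict can also be used directly for prediction by walking it with the fea_name mapping. A minimal sketch (classify and fea_index are illustrative helpers, not part of ID3.py):

fea_index = {name: idx for idx, name in fea_name.items()}

def classify(tree_dict, sample):
    # descend from the root until a (class, purity) leaf tuple is reached
    node = tree_dict["root"]
    while isinstance(node, dict):
        fea = next(iter(node))                    # e.g. "house"
        node = node[fea][sample[fea_index[fea]]]  # follow the branch for this sample's value
    return node[0]

For example, classify(tree_dict, [0, 1, 0, 1]) returns 1: a young applicant with a job, no house, and good credit is granted the loan.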