5.1 根据表5.1所给的训练数据集,利用信息增益比(C4.5算法)生成决策树
DTree.py 实现了 ID3、C4.5 建树;CART 只实现了基尼(Gini)指数的计算,没有实现建树——建树步骤与前两者相同,因进度较慢未完成,基本上可以套用现有代码。你们也可以先手工计算一遍,我就是先手算过一遍再直接写的代码。
# -*- coding: utf-8 -*-
import math
# Algorithm-selection flags — presumably passed as the `ICflag` argument of
# ID3CreateTree to pick the splitting criterion (C4.5 = information-gain
# ratio, ID3 = plain information gain); TODO confirm against the caller.
C45_Flag = True  # algorithm flag
ID3_Flag = False
class DtreeStruct:
    """A decision-tree node.

    Attributes:
        next_nodelist: list of child DtreeStruct nodes, or None for a leaf.
        Ai:    name of the feature this node splits on (or the feature
               associated with a leaf).
        Aival: the feature value on the edge leading to this node.
        value: score recorded at this node (e.g. the gain of the split) —
               semantics depend on the builder; TODO confirm in ID3CreateTree.
        ck:    predicted class label stored at leaves.
    """

    def __init__(self, next_nodelist=None, Ai=None, Aivalue=None, ck=None, value=None):
        self.next_nodelist = next_nodelist
        self.Ai = Ai
        self.Aival = Aivalue
        self.value = value
        self.ck = ck

    def Print(self):
        """Recursively print this subtree (root first, then children depth-first)."""

        def walk(node):
            # A node without children is a leaf: print and stop recursing.
            if node.next_nodelist is None:
                print("叶节点:", node.Ai, node.Aival, node.value, node.ck)
                return
            print("节点", node.next_nodelist, node.Ai, node.Aival, node.value, node.ck)
            for child in node.next_nodelist:
                walk(child)

        print("根节点", self.next_nodelist, self.Ai, self.Aival, self.value, self.ck)
        # Bug fix: a tree that is a single leaf has next_nodelist == None;
        # the original iterated it unconditionally and raised TypeError.
        if self.next_nodelist is None:
            return
        for child in self.next_nodelist:
            walk(child)
class DTree:
def __init__(self,datasets,labels):
self.tree = None
self.datasets = datasets
self.labels = labels
self.GetAandC()
def GetAandC(self):
self.C = {}
self.A = {}
self.C[self.labels[-1]] = set([ line[-1] for line in self.datasets])
for i in range(len(self.labels)-1):
self.A[self.labels[i]] = set([ line[i] for line in self.datasets])
def ID3CreateTree(self,epsilon,ICflag):
#经验熵
def emp_entropy(data,label):
dic = {}
datalen = len(data)
indx = self.labels.index(label)
for line in data:
if line[indx] not in dic:
dic[line[indx]] = 0
#该特征下的取值分类个数(基本上为‘类别’)
dic[line[indx]] += 1
return -sum([(dic[p]/datalen)*math.log(dic[p]/datalen,2) for p in dic.keys()])
#经验条件熵
def emp_cdtl_entropy(data,Ai):
dic = {}
data_dic = {}
c = list(self.C.keys())[0]
indx = self.labels.index(Ai)
datalen = len(data)
for line in data: