ID3是一棵多叉树,这一棵树采用递归的方式构造
- 第一步是构造根节点:遍历所有特征,找到使分类信息增益最大的特征,将其设置为根节点,并且将这个feature从候选特征中删除
- 由于根节点已经将数据分叉,接下来以递归的方式为每个分枝寻找最优特征
- ID3采用信息增益来选取最优分裂特征
#ID3算法
def ID3_chooseBestFeatureToSplit(dataset):
    """Pick the feature column whose split yields the largest information gain.

    Every column of a row except the last one is considered a candidate
    feature. For each candidate, the rows are partitioned by the column's
    distinct values (via ``splitdataset``) and the size-weighted sum of the
    partitions' entropies (via ``jisuanEnt``) is subtracted from the entropy
    of the whole dataset.

    Args:
        dataset: Non-empty sequence of rows; ``dataset[0]`` determines the
            number of candidate feature columns.

    Returns:
        The index of the feature with the highest information gain, provided
        that gain is strictly greater than zero; ``-1`` when no feature
        reaches a positive gain. On ties the lowest index wins.
    """
    feature_count = len(dataset[0]) - 1
    parent_entropy = jisuanEnt(dataset)
    sample_total = float(len(dataset))

    winner = -1
    top_gain = 0.0
    for col in range(feature_count):
        # Weighted entropy of the partitions induced by this column's
        # distinct values.
        split_entropy = 0.0
        for val in {row[col] for row in dataset}:
            part = splitdataset(dataset, col, val)
            weight = len(part) / sample_total
            split_entropy += weight * jisuanEnt(part)

        gain = parent_entropy - split_entropy
        # Strict comparison: a zero-gain feature never replaces the sentinel.
        if gain > top_gain:
            top_gain, winner = gain, col
    return winner
#利用ID3算法创建决策树
def ID3_createTree(dataset, labels):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Args:
        dataset: Non-empty list of rows. The last element of each row is
            used as the class label; the preceding elements are feature
            values.
        labels: Feature names, indexed by feature column. The sequence is
            not modified by this function.

    Returns:
        A class label when the node is a leaf, otherwise a dict shaped like
        ``{feature_name: {feature_value: subtree, ...}}``.

    Raises:
        IndexError: If ``dataset`` is empty.
    """
    classList = [example[-1] for example in dataset]

    # Every sample has the same class: stop splitting.
    if classList.count(classList[0]) == len(classList):
        return classList[0]

    # Only the class column is left: let majorityCnt pick the leaf label
    # (the most frequent class, per the original author's note).
    if len(dataset[0]) == 1:
        return majorityCnt(classList)

    bestFeat = ID3_chooseBestFeatureToSplit(dataset)
    if bestFeat < 0:
        # The selector returns -1 when no feature has a positive information
        # gain. Indexing with -1 would split on the class column itself, so
        # fall back to a majority-vote leaf instead.
        return majorityCnt(classList)

    bestFeatLabel = labels[bestFeat]
    ID3Tree = {bestFeatLabel: {}}

    # Drop the consumed feature name without mutating the caller's list.
    # Assumes splitdataset removes the matching column from each row, so the
    # names stay aligned with the columns - confirm against splitdataset.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]

    # One branch per distinct value of the chosen feature.
    uniqueVals = set(example[bestFeat] for example in dataset)
    for value in uniqueVals:
        sub_dataset = splitdataset(dataset, bestFeat, value)
        ID3Tree[bestFeatLabel][value] = ID3_createTree(sub_dataset, subLabels)

    # NOTE(review): the source's indentation was lost, so it is unclear
    # whether this print sat inside or after the loop; placed after - confirm.
    print(ID3Tree)
    return ID3Tree