决策树通过计算香农熵来度量数据集划分前后的信息增益,信息增益高的就是最好的特征
P(x) 表示样本属于某一分类的概率(即该分类在数据集中出现的频率)。
python实现香农熵
from math import log
def calcshannonent(dataset):
    """Compute the Shannon entropy of *dataset*.

    Each sample is a list whose last element is the class label;
    entropy is H = -sum(p * log2(p)) over the label frequencies.

    Fixes two defects in this draft: ``labels += 1`` raised TypeError
    on a dict (should increment ``labels[bq]``), and
    ``shannon = prob * log(prob, 2)`` overwrote the running total
    instead of accumulating the negated terms.
    """
    n = len(dataset)          # total number of samples
    labels = {}               # label -> occurrence count
    shannon = 0.0
    for i in dataset:
        bq = i[-1]            # class label is the last column
        labels[bq] = labels.get(bq, 0) + 1
    for key in labels:
        prob = labels[key] / n
        shannon -= prob * log(prob, 2)   # accumulate -p*log2(p)
    return shannon
from math import log
# Majority vote: return the most frequent label in a list of labels.
def mojority(classlist):
    """Return the label that occurs most often in *classlist*."""
    tally = {}
    for label in classlist:
        tally[label] = tally.get(label, 0) + 1
    # Highest count wins; ties resolve to the first-seen label,
    # matching the stable sort of the original implementation.
    return max(tally, key=tally.get)
# Shannon entropy of a dataset (class label = last column of each sample).
def calcshannonent(dataset):
    """Return H = -sum(p * log2(p)) over the class-label frequencies."""
    total = len(dataset)
    counts = {}
    for sample in dataset:
        tag = sample[-1]
        counts[tag] = counts.get(tag, 0) + 1
    # Fold each label's probability into the entropy sum.
    entropy = 0.0
    for count in counts.values():
        p = count / total
        entropy -= p * log(p, 2)
    return entropy
def createdataset():
    """Return the toy fish-classification dataset and its feature names."""
    # Columns: "no surfacing", "flippers", class label.
    samples = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    feature_names = ['no surfacing', 'flippers']
    return samples, feature_names
# Module-level smoke call: f receives the toy dataset, d the feature names.
f,d=createdataset()
# Partition the dataset on a single feature value.
def splitdataset(dataset, axis, value):
    """Return the rows of *dataset* whose feature at index *axis* equals
    *value*, with that feature column removed from each returned row."""
    matched = []
    for row in dataset:
        if row[axis] != value:
            continue
        # Rebuild the row without the axis-th column.
        matched.append(row[:axis] + row[axis + 1:])
    return matched
# Pick the feature whose split yields the largest information gain.
def choosebestsplit(dataset):
    """Return the index of the best feature on which to split *dataset*.

    Information gain = base entropy - weighted entropy after the split;
    the feature with the highest positive gain wins.

    Fix: the best-feature index was initialised to 1, which is arbitrary
    and out of range for a single-feature dataset when no split yields
    positive gain; it now defaults to 0, always a valid feature index.
    """
    num_features = len(dataset[0]) - 1          # last column is the label
    base_entropy = calcshannonent(dataset)
    best_gain = 0.0
    best_feature = 0
    for i in range(num_features):               # iterate over features
        values = set(sample[i] for sample in dataset)
        split_entropy = 0.0
        for value in values:                    # iterate over feature values
            subset = splitdataset(dataset, i, value)
            prob = len(subset) / len(dataset)
            split_entropy += prob * calcshannonent(subset)
        gain = base_entropy - split_entropy     # lower split entropy => higher gain
        if gain > best_gain:
            best_gain = gain
            best_feature = i
    return best_feature
# Recursively grow an ID3 decision tree as nested dicts.
def createtree1(dataset, label):
    """Build a decision tree for *dataset*; *label* holds feature names.

    Note: *label* is mutated (the chosen feature name is deleted), so
    callers that reuse the list should pass a copy.
    """
    classes = [row[-1] for row in dataset]
    # Stop condition 1: every sample shares one label -> leaf node.
    if classes.count(classes[0]) == len(classes):
        return classes[0]
    # Stop condition 2: no features remain -> majority label.
    if len(dataset[0]) == 1:
        return mojority(classes)
    best = choosebestsplit(dataset)        # index of the best feature
    best_name = label[best]
    tree = {best_name: {}}
    del label[best]                        # feature consumed at this level
    # Branch on every observed value of the chosen feature; each branch
    # gets its own copy of the remaining feature names.
    for value in set(row[best] for row in dataset):
        subtree = createtree1(splitdataset(dataset, best, value), label[:])
        tree[best_name][value] = subtree
    return tree
# Walk the tree with a test vector and return its predicted label.
def classify(inputtree, labels, testvec):
    """Classify *testvec* with the nested-dict tree *inputtree*.

    *labels* maps feature names to positions in *testvec*; an
    UnboundLocalError is raised (as in the original) when no branch
    matches the test vector's value.
    """
    root_name = next(iter(inputtree))
    branches = inputtree[root_name]          # feature value -> subtree or leaf
    feat_index = labels.index(root_name)     # which slot of testvec to compare
    for branch_value, branch in branches.items():
        if testvec[feat_index] != branch_value:
            continue
        if isinstance(branch, dict):
            classlabel = classify(branch, labels, testvec)   # descend
        else:
            classlabel = branch                              # leaf label
    return classlabel