# 信息增益、熵

熵定义为信息的期望值，假设某件事发生的概率为 $p(x_i)$，则该事件的信息定义为：

$$l(x_i) = -\log_2 p(x_i)$$

那么对于所有事件的信息熵
$$H = -\sum_{i=1}^{n} p(x_i)\,\log_2 p(x_i)$$

ok ,对于信息熵的理解，到这里就够了。

# code

import math
def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of the class labels in dataSet.

    Each row's last element is its class label. Returns
    -sum(p * log2(p)) over label probabilities p.
    """
    numEntries = len(dataSet)
    labelCounts = {}
    for row in dataSet:
        label = row[-1]
        # Bug fix: original initialized to 1 and then added 2 unconditionally,
        # so each label was counted as 1 + 2n instead of n.
        labelCounts[label] = labelCounts.get(label, 0) + 1
    ent = 0.0
    for count in labelCounts.values():
        p = float(count) / numEntries
        ent -= p * math.log(p, 2)
    return ent
def createDataSet():
    """Return the toy fish dataset from Machine Learning in Action ch.3.

    Returns (dataSet, labels): rows are [no-surfacing, flippers, class],
    labels names the two feature columns.
    """
    # Bug fix: first row's class was the typo 'eye'; the book's dataset
    # has two 'yes' rows, which is what the entropy examples assume.
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no', 'flippers']
    return dataSet, labels
def splitData(dataSet, axis, value):
    """Select the rows whose feature at position `axis` equals `value`,
    returning them with that feature column removed."""
    return [row[:axis] + row[axis + 1:]
            for row in dataSet
            if row[axis] == value]
def GetFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain (ID3).

    Returns -1 if no split improves on the parent entropy.
    """
    numFeature = len(dataSet[0]) - 1
    # Bug fix: the original never computed the parent entropy and summed
    # child entropies unweighted, maximizing the wrong quantity. ID3 picks
    # the feature maximizing: H(parent) - sum(|subset|/|parent| * H(subset)).
    baseEnt = calcShannonEnt(dataSet)
    bestGain = 0.0
    bestFeature = -1
    for i in range(numFeature):
        values = set(row[i] for row in dataSet)
        newEnt = 0.0
        for value in values:
            subset = splitData(dataSet, i, value)
            prob = float(len(subset)) / len(dataSet)
            newEnt += prob * calcShannonEnt(subset)  # weighted child entropy
        gain = baseEnt - newEnt
        if gain > bestGain:
            bestGain = gain
            bestFeature = i
    return bestFeature
def majorityCnt(classList):
    """Return the most frequent class label in classList.

    Used when features are exhausted but leaves are still impure.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # Bug fix: dict.iteritems() is Python 2 only; items() works on Python 3.
    sortClassCount = sorted(classCount.items(), key=lambda d: d[1], reverse=True)
    return sortClassCount[0][0]
def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    Tree shape: {featureName: {featureValue: subtree-or-classLabel, ...}}.
    `labels` (feature names, parallel to feature columns) is not mutated.
    """
    classlist = [row[-1] for row in dataSet]  # all class labels
    # Base case 1: node is pure — every row has the same class.
    if classlist.count(classlist[0]) == len(classlist):
        return classlist[0]
    # Base case 2: no features left — vote for the majority class.
    if len(dataSet[0]) == 1:
        return majorityCnt(classlist)
    bestFeature = GetFeatureToSplit(dataSet)      # best feature index
    bestLabel = labels[bestFeature]               # best feature name
    mytree = {bestLabel: {}}
    featureValues = set(row[bestFeature] for row in dataSet)
    subLabels = labels[:]                         # copy so caller's list is untouched
    del subLabels[bestFeature]
    for value in featureValues:  # one branch per observed feature value
        # Bug fix: in the original, "[value]=createTree(...)" was swallowed
        # by an inline comment, so the tree was never populated.
        mytree[bestLabel][value] = createTree(
            splitData(dataSet, bestFeature, value), subLabels[:])
    return mytree
def classify(inputtree, featlabels, testVec):
    """Walk the decision tree and return the class label for testVec.

    inputtree: nested dict from createTree; featlabels: feature names
    parallel to testVec. Returns None if testVec's value for the split
    feature was never seen during training.
    """
    # Bug fix: dict.keys()[0] raises TypeError on Python 3 (views are not
    # subscriptable); next(iter(...)) gets the single root key portably.
    firstlabel = next(iter(inputtree))
    featindex = featlabels.index(firstlabel)  # column of this feature in testVec
    subtree = inputtree[firstlabel]
    claslabel = None  # bug fix: was unbound when no branch matched
    for key in subtree:
        if testVec[featindex] == key:
            if isinstance(subtree[key], dict):
                # internal node: recurse down the matching branch
                claslabel = classify(subtree[key], featlabels, testVec)
            else:
                # leaf node: the stored class label
                claslabel = subtree[key]
    return claslabel

run

# end

《机器学习实战》

©️2019 CSDN 皮肤主题: 编程工作室 设计师: CSDN官方博客