# -*- coding: utf-8 -*-
"""
Created on Thu Dec 27 20:42:09 2018
@author: Aomo Jan
《统计学习方法》第5章 决策树
熵,条件熵,信息增益,信息增益比的计算
"""
from math import log
def calcEntropy(dataSet, attrIndex=-1, logBase=2):
    """
    Compute the empirical (Shannon) entropy of one column of a data set.

    dataSet   -- sequence of rows; every row must be indexable by attrIndex
                 and the values in that column must be hashable.
    attrIndex -- index of the column whose value distribution is measured.
                 Defaults to -1 (the last column, where the class label is
                 usually stored).
    logBase   -- base of the logarithm. Defaults to 2.

    Returns the sum of -p * log(p, logBase) over the distinct values of the
    column, where p is the relative frequency of a value. An empty dataSet
    gives 0.0.
    """
    total = len(dataSet)
    # Tally how often each distinct value occurs in the chosen column.
    tally = {}
    for row in dataSet:
        value = row[attrIndex]
        tally[value] = tally.get(value, 0) + 1
    entropy = 0.0
    for count in tally.values():
        prob = float(count) / total
        entropy -= prob * log(prob, logBase)
    return entropy
def calcConditionalEntropy(dataSet, xIndex, yIndex=-1, logBase=2):
    """
    Compute the empirical conditional entropy H(Y|X) of a data set.

    dataSet -- sequence of rows; every row must be indexable by xIndex and
               yIndex, and the values in both columns must be hashable.
    xIndex  -- index of the conditioning column X.
    yIndex  -- index of the column Y whose entropy is measured. Defaults to
               -1 (the last column).
    logBase -- base of the logarithm. Defaults to 2.

    Returns sum over the distinct values x of column X of
    P(X=x) * H(Y | X=x), where P(X=x) is the relative frequency of x and
    H(Y | X=x) is calcEntropy() of column yIndex over the rows with X == x.
    An empty dataSet gives 0.0.
    """
    total = len(dataSet)
    # Group the rows by their X value in a single pass instead of rescanning
    # the whole data set once per distinct value.
    partitions = {}
    for row in dataSet:
        partitions.setdefault(row[xIndex], []).append(row)
    condEnt = 0.0
    for subset in partitions.values():
        weight = float(len(subset)) / total  # P(X = x)
        # yIndex and logBase must be forwarded explicitly; otherwise the
        # entropy is always taken over the last column in base 2, whatever
        # the caller asked for.
        condEnt += weight * calcEntropy(subset, yIndex, logBase)
    return condEnt
def loadDataSet():
    """
    Return the built-in demo data set as a freshly created list of rows.

    The set has 15 rows of 5 columns each. Column 0 is an age group
    ('青年' young, '中年' middle-aged, '老年' old), columns 1 and 2 are 0/1
    flags, column 3 is a three-level rating ('一般' fair, '好' good,
    '非常好' very good) and column 4 is a 0/1 value used as the class label
    by the rest of this file.
    NOTE(review): this looks like the loan-application example from the
    decision-tree chapter of the book named in the module header, in which
    case columns 1 and 2 would be "has a job" and "owns a house" -- confirm
    against the book.
    """
    return [
        ['青年', 0, 0, '一般', 0],
        ['青年', 0, 0, '好', 0],
        ['青年', 1, 0, '好', 1],
        ['青年', 1, 1, '一般', 1],
        ['青年', 0, 0, '一般', 0],
        ['中年', 0, 0, '一般', 0],
        ['中年', 0, 0, '好', 0],
        ['中年', 1, 1, '好', 1],
        ['中年', 0, 1, '非常好', 1],
        ['中年', 0, 1, '非常好', 1],
        ['老年', 0, 1, '非常好', 1],
        ['老年', 0, 1, '好', 1],
        ['老年', 1, 0, '好', 1],
        ['老年', 1, 0, '非常好', 1],
        ['老年', 0, 0, '一般', 0],
    ]
def testDemo():
    """
    Print entropy statistics for the built-in demo data set.

    Prints the empirical entropy H(D) of the last column, then, for each of
    the first four columns, its conditional entropy, its information gain
    (H(D) minus the conditional entropy) and its information gain ratio
    (the gain divided by the entropy of the column itself).
    """
    samples = loadDataSet()
    baseEntropy = calcEntropy(samples)
    print('经验熵H(D)=', baseEntropy)
    for col in range(4):
        ordinal = col + 1  # attributes are reported 1-based
        condEntropy = calcConditionalEntropy(samples, col)
        print('第%d个属性的条件熵Hch%d=%f' % (ordinal, ordinal, condEntropy))
        gain = baseEntropy - condEntropy
        print('第%d个属性的信息增益g(D,A%d)=%f' % (ordinal, ordinal, gain))
        # Entropy of the attribute column itself: the denominator of the ratio.
        splitEntropy = calcEntropy(samples, col)
        print('第%d个属性的信息增益比gr(D,A%d)=%f' % (ordinal, ordinal, gain / splitEntropy))
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    testDemo()
# 欢迎关注我们: