# -*- coding: cp936 -*-
from math import log
def createDataSet():
    """Build the small hard-coded sample data set used by this module.

    Returns:
        A ``(dataSet, labels)`` tuple. ``dataSet`` is a list of five rows;
        each row holds two 0/1 values followed by a ``'yes'``/``'no'``
        string in the last position. ``labels`` is a list of two names,
        presumably one per feature column, in column order.
    """
    dataSet = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels
def calShannonEnt(dataset):
    """Compute the Shannon entropy (base 2) of the class labels in *dataset*.

    Args:
        dataset: a sequence of rows; the last element of each row is taken
            as that row's class label.

    Returns:
        The entropy ``-sum(p * log(p, 2))`` over the label frequencies, as a
        float. An empty data set yields ``0.0``.

    Side effects:
        Prints the running label tally after every row (trace output).
    """
    numEntries = len(dataset)  # total number of rows in the data set
    labelCounts = {}
    # Iterate the *parameter*; the original looped over an unrelated name.
    for featVec in dataset:
        currentLabel = featVec[-1]
        # Tally this label; .get() supplies 0 the first time a label is seen.
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
        # Parenthesised form prints identically on Python 2 and Python 3.
        print(labelCounts)
    shannonEnt = 0.0
    for key in labelCounts:
        # float() keeps the division exact under Python 2 integer semantics.
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
# labelCounts looks like this:
# >>> import trees
# >>> dataSet, labels = trees.createDataSet()
# >>> trees.calShannonEnt(dataSet)
# {'yes': 1}
# {'yes': 2}
# {'yes': 2, 'no': 1}
# {'yes': 2, 'no': 2}
# {'yes': 2, 'no': 3}    <- labelCounts
# 0.9709505944546686     <- Shannon entropy