1.香农熵
信息熵是什么?
参考
https://www.zhihu.com/question/22178202/answer/49929786 知乎大神
香农熵代表数据的无序程度,香农熵越高,无序程度越高
2.计算香农熵
def calcShannonEnt(dataSet):
    """Compute the Shannon entropy (in bits) of the class labels in a data set.

    The class label of each row is taken from its last element.

    Args:
        dataSet: A sized iterable of rows (e.g. a list of lists). Only the
            last element of every row is read.

    Returns:
        The entropy ``-sum(p * log2(p))`` over the label frequencies as a
        float. An empty data set yields ``0.0``.
    """
    # Imported locally so the function does not depend on a module-level
    # ``from math import log`` being present in the file.
    from collections import Counter
    from math import log

    numEntries = len(dataSet)

    # Frequency of every class label (last column of each row).
    labelCounts = Counter(featVec[-1] for featVec in dataSet)

    # Accumulate -p(x) * log2(p(x)) over all labels.
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
给出数据集合
def createDataSet():
    """Build the small sample data set used throughout these notes.

    Returns:
        A ``(dataSet, labels)`` tuple. Every row of ``dataSet`` holds two
        0/1 feature values followed by a ``'yes'``/``'no'`` class label;
        ``labels`` holds the names of the two feature columns.
    """
    dataSet = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels
调用
# Load the decision-tree module under a short alias.
# NOTE(review): presumably this module contains the createDataSet and
# calcShannonEnt functions listed in these notes - confirm the package path.
import org.gfzs.machineLearning.decisionTree as dsTree
# Unpack the two values returned by createDataSet and print the first one.
myDat, labels = dsTree.createDataSet()
print("myDat:{}".format(myDat))
# Compute the Shannon entropy of the sample rows and print it.
shannonEnt = dsTree.calcShannonEnt(myDat)
print("shannonEnt:{}".format(shannonEnt))
输出结果
myDat:[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
shannonEnt:0.9709505944546686
3.划分数据集
def splitDataSet(dataSet, axis, value):
    """Select the rows whose feature ``axis`` equals ``value``, dropping that column.

    This forms one branch of the decision tree: the returned rows no longer
    contain the feature that was split on.

    Args:
        dataSet: Iterable of rows (lists or tuples) to split.
        axis: Index of the feature column to split on. Negative indices
            count from the end of the row, as in normal Python indexing.
        value: Feature value a row must have to be kept.

    Returns:
        A new list of new lists, one per matching row, each without the
        ``axis`` column. The input rows are never modified.

    Raises:
        IndexError: If ``axis`` is out of range for a row.
    """
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # Normalise a negative index first; otherwise axis == -1 would
            # make ``featVec[axis + 1:]`` the whole row instead of nothing.
            col = axis if axis >= 0 else len(featVec) + axis
            retDataSet.append(list(featVec[:col]) + list(featVec[col + 1:]))
    return retDataSet
选择最优的数据划分方式:依次按每个特征划分数据集,计算划分后各子集香农熵的加权和(权重为该子集占整个数据集的比例),再用划分前的香农熵减去这个加权和,得到信息增益。信息增益最大的特征就是最优划分特征,也就是说,按它划分后数据的无序程度下降得最多,各个子集最“纯”。
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature whose split gives the largest information gain.

    For every feature column the data set is partitioned by the distinct
    values in that column; the information gain is the entropy of the whole
    data set minus the size-weighted entropy of the partitions.

    Args:
        dataSet: Sized sequence of rows. The last element of each row is the
            class label; every other element is a feature value.

    Returns:
        The index of the best feature, or ``-1`` when the data set is empty
        or no feature yields a gain greater than zero.
    """
    # Nothing to split on; avoids an IndexError on dataSet[0].
    if len(dataSet) == 0:
        return -1

    numFeatures = len(dataSet[0]) - 1  # the last column is the class label
    totalRows = float(len(dataSet))
    baseEntropy = calcShannonEnt(dataSet)  # entropy before any split

    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        # Distinct values taken by feature i.
        uniqueVals = {example[i] for example in dataSet}

        # Size-weighted entropy of the partitions produced by feature i.
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / totalRows
            newEntropy += prob * calcShannonEnt(subDataSet)

        # Keep the feature with the largest information gain; the strict
        # comparison keeps the earliest feature on ties.
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
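根据实例来看:第0个特征是最优的划分特征。再用数据验证一下:按第0个特征划分,取值为0的分组全部是'no'(非鱼类),只有取值为1的分组(2个'yes'、1个'no')需要再次划分,信息增益约为0.420;按第1个特征划分,取值为0的分组只有1条'no',而取值为1的分组中'yes'和'no'各占2条,仍然完全混杂,信息增益约为0.171。因此认为特征0的划分效果更好。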