# 1. Measures of disorder: entropy and Gini impurity.
# 2. Information gain: the reduction in disorder — larger is better.
# 3. Choose the feature split that maximizes information gain.
from math import log
import operator
def calcShannonEnt(dataset):
    """Compute the Shannon entropy of the class labels in *dataset*.

    Each example is a sequence whose last element is the class label.
    Entropy is H = -sum(p_i * log2(p_i)) over the label distribution.

    Args:
        dataset: list of examples; example[-1] is the class label.

    Returns:
        float: entropy in bits (0.0 for an empty dataset).
    """
    numEntries = len(dataset)
    if numEntries == 0:
        return 0.0  # guard: the original divided by zero on an empty dataset
    labelCounts = {}
    for featVec in dataset:
        label = featVec[-1]
        # dict.get with a default replaces the key-presence check.
        labelCounts[label] = labelCounts.get(label, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
def createDataSet():
    """Return a tiny toy dataset and the names of its two features.

    The last column of each example is the class label ('yes'/'no');
    the two leading feature columns are named by the returned list.
    """
    samples = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    featureNames = ['no surfacing', 'flippers']
    return samples, featureNames
def splitDataSet(dataSet, axis, value):
    """Select the examples whose feature at *axis* equals *value*.

    The matched feature column is removed from each returned example,
    so the result is ready for a recursive split on the remaining
    features.

    Args:
        dataSet: list of examples (lists).
        axis: index of the feature to filter on.
        value: feature value to keep.

    Returns:
        list: new examples with column *axis* dropped.
    """
    kept = []
    for example in dataSet:
        if example[axis] != value:
            continue
        # Slice concatenation skips the axis column without mutating
        # the original example.
        kept.append(example[:axis] + example[axis + 1:])
    return kept
def chooseBestFeatureToSplit(dataSet):
    """Pick the feature index whose split gives the largest information gain.

    For each feature, partitions *dataSet* by every distinct value of
    that feature, computes the weighted average entropy of the
    partitions, and subtracts it from the entropy of the whole set.

    Args:
        dataSet: list of examples; example[-1] is the class label, the
            remaining positions are feature values.

    Returns:
        int: index of the best feature, or -1 if no split improves on
        the base entropy.
    """
    numFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        uniqueVals = set(example[i] for example in dataSet)
        # Weighted average entropy of the partitions induced by feature i.
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            # BUG FIX: accumulate with += — the original plain assignment
            # kept only the LAST partition's weighted entropy, so the
            # information gain was computed from a single partition.
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature