def chooseBestFeatureToSplit(dataset):
    """Return the index of the feature that best splits *dataset*.

    Implements the ID3 criterion: for every feature, compute the
    information gain (base entropy minus the weighted entropy of the
    partition induced by that feature) and pick the feature with the
    highest gain.

    Args:
        dataset: list of samples; each sample is a list whose last
            element is the class label and whose preceding elements
            are feature values.

    Returns:
        int: index of the best feature to split on, or -1 if no
        feature yields a positive information gain.
    """
    numFeatures = len(dataset[0]) - 1          # last column is the class label
    baseEntropy = calcshannonEnt(dataset)      # entropy before any split
    bestInfoGain = 0.0                         # best gain seen so far
    bestFeature = -1                           # -1 means "no useful split found"
    for i in range(numFeatures):
        # Distinct values this feature takes across the dataset.
        uniqueVals = set(sample[i] for sample in dataset)
        newEntropy = 0.0
        for value in uniqueVals:
            # Subset of samples where feature i equals this value.
            subdataset = splitdataset(dataset, i, value)
            prob = len(subdataset) / float(len(dataset))
            # Accumulate the probability-weighted entropy of each branch.
            newEntropy += prob * calcshannonEnt(subdataset)
        infoGain = baseEntropy - newEntropy    # entropy reduction from this split
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
# Decision tree — choosing the best feature for splitting the dataset.
# (Original article last recommended/published 2024-03-08 11:32:45.)