def chooseBestFeatureToSplit(dataset):
    """Return the index of the feature that best splits *dataset*.

    Implements the ID3 criterion: for every feature, compute the
    information gain (base entropy minus the weighted entropy of the
    partition induced by that feature) and pick the feature with the
    highest gain.

    Args:
        dataset: list of samples; each sample is a list whose last
            element is the class label and whose preceding elements
            are feature values.

    Returns:
        int: index of the best feature to split on, or -1 if no
        feature yields a positive information gain.
    """
    numFeatures = len(dataset[0]) - 1          # last column is the class label
    baseEntropy = calcshannonEnt(dataset)      # entropy before any split
    bestInfoGain = 0.0                         # best gain seen so far
    bestFeature = -1                           # -1 means "no useful split found"
    for i in range(numFeatures):
        # Distinct values this feature takes across the dataset.
        uniqueVals = set(sample[i] for sample in dataset)
        newEntropy = 0.0
        for value in uniqueVals:
            # Subset of samples where feature i equals this value.
            subdataset = splitdataset(dataset, i, value)
            prob = len(subdataset) / float(len(dataset))
            # Accumulate the probability-weighted entropy of each branch.
            newEntropy += prob * calcshannonEnt(subdataset)
        infoGain = baseEntropy - newEntropy    # entropy reduction from this split
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
# Decision tree — choosing the best feature for splitting the dataset.
# (Original article last recommended/published 2024-03-08 11:32:45.)