使用信息增益构建决策树


包含了离散变量与连续变量

def calculate_ent(Dataset):
    """Shannon entropy (base 2) of the class labels.

    Each row's last element is treated as its class label; the entropy of
    the label distribution over all rows is returned.
    """
    total = len(Dataset)
    counts = {}
    for row in Dataset:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    return -sum((c / total) * log(c / total, 2) for c in counts.values())
# Rows whose `feature` column equals `value`, with that column removed
def GetSubset(Dataset, feature, value):
    """Filter Dataset to rows where column `feature` == `value`.

    The matched column itself is dropped from each returned row; each row
    comes back as a numpy array (np.hstack of the slices around the column).
    """
    return [
        np.hstack((row[:feature], row[feature + 1:]))
        for row in Dataset
        if row[feature] == value
    ]
# Binary partition of a continuous feature at a threshold
def GetSubset_continue(Dataset, feature, value):
    """Partition Dataset by comparing column `feature` against `value`.

    Returns (left, right): rows with feature <= value and rows with
    feature > value. Rows keep all of their columns.
    """
    Subset_left, Subset_right = [], []
    for row in Dataset:
        bucket = Subset_left if row[feature] <= value else Subset_right
        bucket.append(row)
    return Subset_left, Subset_right
# Information gain of splitting on one feature column
def calculate_gain(Dataset, feature, continue_label=0):
    """Information gain obtained by splitting Dataset on column `feature`.

    Parameters
    ----------
    Dataset : rows whose last element is the class label.
    feature : column index to split on.
    continue_label : 0 -> discrete feature; otherwise treated as continuous.

    Returns (gain, t) where t is the chosen threshold for a continuous
    feature and 0 for a discrete one.

    Fixes: previously `gain` stayed unbound (NameError at the return) for
    continue_label not in {0, 1}; `np.Inf` was removed in NumPy 2.0; the
    dataset entropy was recomputed inside the candidate loop although it is
    loop-invariant.
    """
    t = 0
    num_total = len(Dataset)
    base_entropy = calculate_ent(Dataset)  # hoisted: invariant across splits
    if continue_label == 0:
        # Discrete: subtract the weighted entropy of every value-subset.
        gain = base_entropy
        for value in set(row[feature] for row in Dataset):
            Subset = GetSubset(Dataset, feature, value)
            gain -= len(Subset) / num_total * calculate_ent(Subset)
    else:
        # Continuous: evaluate every candidate threshold, keep the best
        # bi-partition (C4.5-style).
        candidates = half_apart(sorted(row[feature] for row in Dataset))
        gain = -np.inf
        for value in candidates:
            left, right = GetSubset_continue(Dataset, feature, value)
            gain0 = (base_entropy
                     - len(left) / num_total * calculate_ent(left)
                     - len(right) / num_total * calculate_ent(right))
            if gain0 > gain:
                gain = gain0
                t = value
    return gain, t

# Candidate split points for a continuous feature
def half_apart(T):
    """Return candidate thresholds for a sorted sequence of feature values.

    Output is the elementwise mean of (0, T[0], ..., T[n-2]) with T, i.e.
    the midpoints of consecutive values preceded by T[0]/2 (the extra first
    candidate is inherited from the original shifting trick and kept for
    behavioural compatibility).

    Fix: the original implementation mutated the caller's list in place via
    insert(0, 0)/pop(-1); this version leaves T untouched.
    """
    vals = np.asarray(T, dtype=float)
    if vals.size == 0:
        return vals
    shifted = np.concatenate(([0.0], vals[:-1]))
    return (shifted + vals) / 2.0
# Select the feature with the largest information gain
def ChooseBestfeature(Dataset, Featurelabel):
    """Return the index of the feature with maximal information gain.

    Featurelabel[i] is 0 for a discrete feature and 1 for a continuous one.

    Bug fix: the old seed call calculate_gain(Dataset, 1, Featurelabel[0])
    paired feature index 1 with feature 0's type flag (and mis-seeded
    BestFeature to 1); it was also redundant because the loop evaluates
    every feature anyway. Seed with -inf instead.
    """
    BestGain = float('-inf')
    BestFeature = 0
    for i, label in enumerate(Featurelabel):
        Gain, _t = calculate_gain(Dataset, i, label)
        # '>=' keeps the original tie-breaking: later features win ties.
        if Gain >= BestGain:
            BestFeature = i
            BestGain = Gain
    return BestFeature

# 投票表决
import operator
def MostChoice(Dataset):
    """Majority vote: return the most frequent class label (last column).

    Ties are broken in favour of the label encountered first, matching the
    stability of the original sorted()-based implementation.
    """
    tally = {}
    for row in Dataset:
        tally[row[-1]] = tally.get(row[-1], 0) + 1
    return max(tally.items(), key=operator.itemgetter(1))[0]
# Splittability check: can the remaining features still separate the samples?
def Severability(Dataset):
    """Return False when two rows share an identical feature vector.

    The last column (the class label) is excluded from the comparison; if
    any two rows agree on every remaining feature, no further split can
    separate them.

    Improvement: returns on the first duplicate pair instead of always
    scanning all O(n^2) pairs as the original did (same result, less work).
    """
    data = np.delete(Dataset, -1, axis=1)  # drop the label column
    for i in range(len(data)):
        for j in range(i + 1, len(data)):
            if (data[i] == data[j]).all():
                return False
    return True
#
def CreatTree(DataSet, Featurelabel, Feature_names):
    """Recursively build a decision tree represented as nested dicts.

    DataSet: rows whose last element is the class label.
    Featurelabel: per-feature type flags, 0 = discrete, 1 = continuous.
    Feature_names: display names of the features (used as tree-dict keys).

    Returns either a class label (leaf) or {feature_name: {value: subtree}}.

    NOTE(review): when the best feature is discrete, this function deletes
    entries from the Featurelabel/Feature_names lists it was given, i.e. it
    mutates the caller's lists — callers should pass copies.
    """
    LabelList = [FeatureVector[-1] for FeatureVector in DataSet]
    # Leaf: every sample carries the same class label
    if LabelList.count(LabelList[0]) == len(LabelList):
        return LabelList[0]
    # Leaf: samples can no longer be separated, or no feature columns remain
    if Severability(DataSet) == False or len(DataSet[0]) == 1:
        return MostChoice(DataSet)
    # Pick the feature with the highest information gain
    bestFeature = ChooseBestfeature(DataSet, Featurelabel)
    bestFeaturename = Feature_names[bestFeature]
    bestFeaturelabel = Featurelabel[bestFeature]
    MyTree = {bestFeaturename:{}}
    # A discrete feature is consumed once used, so drop it from both lists;
    # a continuous feature may be reused deeper in the tree.
    if Featurelabel[bestFeature] == 0:
        del (Feature_names[bestFeature])
        del (Featurelabel[bestFeature])
    # Discrete split: one branch per distinct value; GetSubset removes the
    # used column from each subset, matching the list deletions above.
    if bestFeaturelabel == 0:
        bestFeaturevalue = set(FeatureVector[bestFeature] for FeatureVector in DataSet)
        for value in bestFeaturevalue:
            Subset = GetSubset(DataSet, bestFeature, value)
            SubLabels = Featurelabel[:]
            Subnames = Feature_names[:]
            MyTree[bestFeaturename][value] = CreatTree(Subset, SubLabels, Subnames)
    # Continuous split: two branches around the best threshold t
    if bestFeaturelabel == 1:
        gain, t = calculate_gain(DataSet, bestFeature, continue_label=1)
        tip = ['<='+str(t), '>'+str(t)]
        Subsetleft, Subsetright = GetSubset_continue(DataSet, bestFeature, t)
        for i, value in enumerate(tip):
            if i == 0:
                Subset = Subsetleft
            else:
                Subset = Subsetright
            SubLabels = Featurelabel[:]
            Subnames = Feature_names[:]
            MyTree[bestFeaturename][value] = CreatTree(Subset, SubLabels, Subnames)
    return MyTree

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值