2020-11-05_csdn yjy-CSDN博客

本文链接：https://blog.csdn.net/yjylalala/article/details/109516729

关于python实现决策树算法

ID3（信息增益）和 C4.5（信息增益比）的计算都可以在 best_split_method 方法中看到；CART 使用的基尼指数在下面的代码中并没有实现。

基本上跟着大佬的思路走
这里记录下自己写过程中遇到的错误
错误1：

        if result not in all_entropy_dic.keys():
            all_entropy_dic[result]=0
        else:
            all_entropy_dic[result]+=1
        # 这样写的话，每个类别第一次出现时只会被记为 0，计数都少了 1，导致后面的熵运算错误；
        # 正确做法是去掉 else，让 +=1 每次都执行。

这里的 best_split_method() 方法实际上并没有用 feature 参数来参与运算，
它只是在打印各种熵与增益时，用来显示当前计算的是哪个属性。
刚开始写的时候，我过于强调去查找对应的特征名字，
后来发现其实只要按列下标选取数据来运算，就简单很多了。

from math import log
import operator  #在这里使用来排序的
# import treePlotter
#import draw_tree

# Sample training set, stored as a two-dimensional list (one inner list per
# sample). The first four columns line up with `labels`; the last column is
# the class label ('yes' / 'no').
dataSet = [
    ['sunny', 'hot', 'high', 'weak', 'no'],
    ['sunny', 'hot', 'high', 'strong', 'no'],
    ['overcast', 'hot', 'high', 'weak', 'yes'],
    ['rain', 'mild', 'high', 'weak', 'yes'],
    ['rain', 'cool', 'normal', 'weak', 'yes'],
    ['rain', 'cool', 'normal', 'strong', 'no'],
    ['overcast', 'cool', 'normal', 'strong', 'yes'],
    ['sunny', 'mild', 'high', 'weak', 'no'],
    ['sunny', 'cool', 'normal', 'weak', 'yes'],
    ['rain', 'mild', 'normal', 'weak', 'yes'],
    ['sunny', 'mild', 'normal', 'strong', 'yes'],
    ['overcast', 'mild', 'high', 'strong', 'yes'],
    ['overcast', 'hot', 'normal', 'weak', 'yes'],
    ['rain', 'mild', 'high', 'strong', 'no'],
]

# Names of the feature columns, in column order.
labels = ['outlook', 'temperature', 'humidity', 'wind']
def calculate_all_entropy(dataset):
    """Return the base-2 Shannon entropy of the class labels in ``dataset``.

    ``dataset`` is a two-dimensional list; the last element of every row is
    taken as that row's class label. An empty dataset yields ``0``.
    """
    total_rows = len(dataset)

    # Tally how many rows carry each class label (first-seen order).
    label_counts = {}
    for row in dataset:
        label = row[-1]
        label_counts[label] = label_counts.get(label, 0) + 1

    # Accumulate -p * log2(p) over every observed label.
    entropy = 0
    for count in label_counts.values():
        prob = float(count / total_rows)
        entropy -= prob * log(prob, 2)
    return entropy

def split_dataset_method(dataset,i,feature):
    """Return the rows whose column ``i`` equals ``feature``, minus that column.

    Every matching row is copied with element ``i`` left out, so the rows of
    ``dataset`` themselves are not modified. Rows that do not match are
    dropped; the result is an empty list when nothing matches.
    """
    return [
        row[:i] + row[i + 1:]  # everything before and after column i
        for row in dataset
        if row[i] == feature
    ]

def best_split_method(dataset,feature):
    """Pick the column whose split yields the highest information gain ratio.

    For every feature column the function computes the conditional entropy,
    the information gain (the ID3 criterion) and the information gain ratio
    (the C4.5 criterion), prints them, and keeps the column with the largest
    gain ratio.

    Args:
        dataset: Two-dimensional list; the last column of each row is the
            class label, all other columns are features.
        feature: Feature names in column order. They are only used to label
            the printed diagnostics, not in the computation.

    Returns:
        The index of the best column, or ``-1`` when ``dataset`` is empty or
        no column has a gain ratio greater than zero.
    """
    if not dataset:
        return -1
    feature_num = len(dataset[0]) - 1  # the last column is the class label
    all_entropy = calculate_all_entropy(dataset)
    total_rows = float(len(dataset))
    best_gain = 0.0
    best_feature_id = -1
    for i in range(feature_num):
        unique_value = {line[i] for line in dataset}  # distinct values of column i
        new_entropy = 0.0  # conditional entropy after splitting on column i
        s = 0.0            # split information (entropy of the column itself)
        for sub_feature in unique_value:
            sub_dataset = split_dataset_method(dataset, i, sub_feature)
            p = len(sub_dataset) / total_rows  # share of rows in this branch
            new_entropy += p * calculate_all_entropy(sub_dataset)
            s -= p * log(p, 2)
        gain = all_entropy - new_entropy  # information gain
        # A column with a single distinct value has zero split information;
        # dividing by it would raise ZeroDivisionError. Such a column cannot
        # separate anything, so its gain ratio is treated as 0.
        info_gain = gain / s if s > 0 else 0.0  # information gain ratio
        print("信息熵为："+str(all_entropy))
        print(str(feature[i])+'的条件熵为'+str(new_entropy))
        print(str(feature[i]) + '的信息增益为' + str(gain))
        print(str(feature[i]) + '的信息增益比为' + str(info_gain))
        if info_gain > best_gain:  # keep the column with the highest gain ratio
            best_gain = info_gain
            best_feature_id = i
    return best_feature_id

def sort_by_frequency(datalist):
    """Return the item that occurs most often in ``datalist``.

    The ``(item, count)`` pairs are ordered by descending count and printed
    before the winner is returned. When several items share the highest
    count, the one that appeared first in ``datalist`` wins because the sort
    is stable. An empty ``datalist`` raises ``IndexError``.
    """
    tally = {}
    for item in datalist:
        tally[item] = tally.get(item, 0) + 1

    ranked = list(tally.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    print(ranked)

    most_common_item, _count = ranked[0]
    return most_common_item

def create_tree(dataset,feature):
    """Recursively build a decision tree as nested dictionaries.

    Args:
        dataset: Non-empty two-dimensional list; the last column of each row
            is the class label, all other columns are features.
        feature: Feature names in column order. The list is not modified.

    Returns:
        A class label when the node is a leaf, otherwise a dict of the form
        ``{feature_name: {feature_value: subtree, ...}}``.
    """
    result_list = [line[-1] for line in dataset]
    # Every sample has the same class: nothing left to split.
    if result_list.count(result_list[0]) == len(result_list):
        return result_list[0]
    # Only the class column remains: fall back to a majority vote. The vote
    # is taken over the labels, because the rows themselves are lists and
    # cannot be counted as dictionary keys.
    if len(dataset[0]) == 1:
        return sort_by_frequency(result_list)
    best_feature = best_split_method(dataset, feature)
    # -1 means no column improves on the current node; using it as an index
    # would split on the class column, so emit a majority-vote leaf instead.
    if best_feature == -1:
        return sort_by_frequency(result_list)
    best_label = feature[best_feature]
    decsion_tree = {best_label: {}}  # the tree is stored as nested dicts
    # Build a new name list without the chosen feature rather than deleting
    # from the caller's list, so the caller's labels stay usable afterwards.
    remaining_features = feature[:best_feature] + feature[best_feature + 1:]
    unique_values = {example[best_feature] for example in dataset}
    for value in unique_values:
        temp = split_dataset_method(dataset, best_feature, value)
        decsion_tree[best_label][value] = create_tree(temp, remaining_features)
    return decsion_tree

# Script entry point: build the decision tree for the sample data and print
# the resulting nested dictionary.
if __name__=="__main__":
    print(create_tree(dataSet,labels))
    # Optional plot of the tree; presumably needs the separate draw_tree
    # module whose import is commented out at the top of the file.
    # draw_tree.createPlot(create_tree(dataSet,labels))


# calculate_all_entropy(dataSet)
# split_dataset_method(dataSet,0,'rain')
# best_split_method(dataSet,labels)
# sort_by_frequency(['yes','no','no','yes','no'])