机器学习复现4.非递归法建立ID3决策树

ID3选取信息增益作为最优特征的选择方式。

import numpy as np
import pandas as pd


class Leaf_Node(object):
    """Terminal node of the decision tree: carries only a final class label."""
    category = None  # predicted class label for samples reaching this leaf

    def __init__(self, category):
        self.category = category

class Branch_Node(object):
    """Internal decision-tree node that splits the data on one feature.

    Attributes:
        category: the parent feature's value that leads to this node; assigned
            by the tree builder after construction (None for the root).
        character: name of the feature this node splits on.
        D: dataset (DataFrame) attached to this node while building the tree.
        A: remaining candidate feature names while building the tree.
        charac_index: index of `character` within A.
        child_list: children (Leaf_Node or Branch_Node instances).
    """
    category = None
    character = None
    D = None
    A = None
    charac_index = None
    child_list = None

    def __init__(self, character, D, A, charac_index):
        self.character = character
        self.D = D
        self.A = A
        self.charac_index = charac_index
        # Fresh list per instance — a shared class-level list would leak
        # children between nodes.
        self.child_list = []
        # Initialize on the instance instead of silently relying on the class
        # attribute; the builder overwrites this after construction.
        self.category = None

class Decision_Tree(object):
    """ID3 decision tree, built non-recursively with a stack.

    Feature selection criterion: information gain. The label column is
    hard-coded as '类别'.
    """
    data = None           # full training DataFrame (loaded in __init__)
    chara = None          # list of feature (column) names
    threshold = None      # minimum information gain required to keep splitting
    decision_tree = None  # root node, set by Build_Tree

    def __init__(self):
        # Load the training set; 'ID' is a row identifier, not a feature.
        data = pd.read_excel("./data.xlsx")
        self.data = data.drop(labels=['ID'], axis=1)
        self.chara = self.character(self.data)
        self.threshold = 0

    def duishu(self, p):
        """Return p * log2(p), with the entropy convention 0 * log(0) := 0.

        NOTE: the value is positive-signed; callers subtract it when
        accumulating entropies.
        """
        if p == 0:
            return 0
        return p * np.log2(p)

    def shang(self, data):
        """Return (best_feature_index, information_gain) for dataset `data`.

        Computes H(D) - H(D|A_i) for every feature column i and picks the
        maximum. Raises ValueError if no feature columns remain.
        """
        data_x, data_y = self.split_x_y(data)
        if data_x.shape[1] == 0:
            # Guard: np.argmax over an empty gain array would raise anyway,
            # but with a far less helpful message.
            raise ValueError("no feature columns left to evaluate")

        # Empirical entropy H(D) of the label distribution.
        jingyanshang = 0
        count_y = data_y.value_counts(normalize=True)
        item_y = list(count_y.index)  # label values, each wrapped in a 1-tuple
        for p in count_y:
            jingyanshang -= self.duishu(p)

        # Conditional entropy H(D|A_i) for each feature column.
        exp_condition_shang = []
        for i in range(data_x.shape[1]):
            exp_cond = 0
            data_x_slice = pd.merge(data_x.iloc[:, [i]], data_y,
                                    left_index=True, right_index=True)
            count_x = data_x_slice.iloc[:, [0]].value_counts(normalize=True)
            for x_tuple in count_x.index:
                x_item = x_tuple[0]
                sub = data_x_slice[data_x_slice[data_x_slice.columns.values[0]] == x_item]
                weight = sub.shape[0] / data_x_slice.shape[0]
                for y_tuple in item_y:
                    y_item = y_tuple[0]
                    sub_y = sub[sub[sub.columns.values[1]] == y_item]
                    exp_cond -= weight * self.duishu(sub_y.shape[0] / sub.shape[0])
            exp_condition_shang.append(exp_cond)

        # Explicit ndarray conversion: the original relied on numpy-scalar
        # broadcasting over a plain list, which breaks if jingyanshang is a
        # Python float. (The unused gain-ratio computation was removed.)
        zengyi = jingyanshang - np.array(exp_condition_shang)  # information gain
        best = int(np.argmax(zengyi))
        return best, zengyi[best]

    def split_x_y(self, D):
        """Split dataset D into (features DataFrame, label DataFrame)."""
        return D.drop(labels=['类别'], axis=1, inplace=False), D[['类别']]

    def character(self, D):
        """Return the list of feature column names of dataset D."""
        return list(D.drop(labels=['类别'], axis=1, inplace=False).columns)

    def Build_Tree(self, D, A):
        """Build the tree iteratively from dataset D and feature list A.

        Uses an explicit stack instead of recursion; stores the root in
        self.decision_tree.
        """
        stack = []
        root = self.Build_Node(D, A)
        stack.append(root)
        while len(stack) > 0:
            cur = stack.pop()
            if isinstance(cur, Branch_Node):
                D = cur.D
                A = cur.A
                charac_index = cur.charac_index
                charac_name = cur.character
                A_new = A.copy()
                del A_new[charac_index]  # consume the chosen feature
                # One child per observed value of the chosen feature.
                for i in D[charac_name].value_counts().index:
                    D_new_1 = D[D[charac_name] == i]
                    D_new_1 = D_new_1.drop(labels=[charac_name], axis=1)
                    mChild = self.Build_Node(D_new_1, A_new)
                    mChild.category = i  # edge label: feature value leading here
                    cur.child_list.append(mChild)
                    stack.append(mChild)
        self.decision_tree = root
        return

    def Build_Node(self, D, A):
        """Create a Leaf_Node or Branch_Node for dataset D with feature set A.

        Leaf conditions: all samples share one label, no features remain, or
        the best information gain is below self.threshold.
        """
        D_X, D_Y = self.split_x_y(D)
        counts = D_Y.value_counts(ascending=False)  # most frequent label first
        # Check the gain-free stopping rules BEFORE calling shang(): the
        # original called shang() first and crashed on np.argmax of an empty
        # array once every feature had been consumed.
        if len(counts.index) == 1 or len(A) == 0:
            return Leaf_Node(counts.index[0][0])
        charac_index, character_zengyi = self.shang(D)
        if character_zengyi < self.threshold:
            return Leaf_Node(counts.index[0][0])
        return Branch_Node(A[charac_index], D, A, charac_index)

    def Traverse_Tree(self, test):
        """Classify the single-row DataFrame `test`; return the predicted label."""
        node = self.decision_tree
        while isinstance(node, Branch_Node):
            ans = test[node.character][0]
            for child in node.child_list:
                if child.category == ans:
                    node = child
                    break
            else:
                # The original spun forever on a feature value never seen in
                # training; fail loudly instead.
                raise ValueError(
                    "no branch for value %r of feature %r" % (ans, node.character))
        return node.category
if __name__ == "__main__":
    # Demo: train on data.xlsx, then classify one hand-written sample.
    d = Decision_Tree()
    d.Build_Tree(d.data, d.chara)
    test_data = [['青年', '否', '否', '一般']]
    test = pd.DataFrame(test_data, columns=['年龄', '有工作', '有自己的房子', '信贷情况'])
    print(test, "的预测结果是:")
    print(d.Traverse_Tree(test))

训练数据:

(原文此处为训练数据截图,图片在抓取时丢失)

测试数据及结果

(原文此处为测试数据及预测结果截图,图片在抓取时丢失)

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值