[Watermelon_book] Chapter 4 Decision Tree

The Basic Idea of Decision Trees


Overview of Tree-Building Methods


The Basic Principle of the ID3 Algorithm

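In brief, ID3 grows the tree greedily: at each node it splits on the attribute with the largest information gain. With $p_k$ the fraction of class $k$ in dataset $D$, and $D^v$ the subset of $D$ taking value $v$ of attribute $a$ (standard watermelon-book notation):

```latex
\mathrm{Ent}(D) = -\sum_{k=1}^{|\mathcal{Y}|} p_k \log_2 p_k,
\qquad
\mathrm{Gain}(D, a) = \mathrm{Ent}(D) - \sum_{v=1}^{V} \frac{|D^v|}{|D|}\,\mathrm{Ent}(D^v).
```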

A Basic Tree Structure in Python

Definition of Binary Tree

class BinaryTree:
    def __init__(self, value):
        self.value = value
        self.left_child = None
        self.right_child = None

    # Insert a node on the left.
    # If there is no left child yet, attach the new node directly;
    # otherwise, splice the new node in between this node and its
    # current left child.
    def insert_left(self, value):
        if self.left_child is None:
            self.left_child = BinaryTree(value)
        else:
            new_node = BinaryTree(value)
            new_node.left_child = self.left_child
            self.left_child = new_node

    # Insert a node on the right, with the same splice-in-between behavior.
    def insert_right(self, value):
        if self.right_child is None:
            self.right_child = BinaryTree(value)
        else:
            new_node = BinaryTree(value)
            new_node.right_child = self.right_child
            self.right_child = new_node
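Note that `insert_left` splices rather than overwrites when a left child already exists. A minimal self-contained check of that behavior (the class is repeated here so the snippet runs on its own):

```python
# Sketch: demonstrate the splice-in-between behavior of insert_left.
class BinaryTree:
    def __init__(self, value):
        self.value = value
        self.left_child = None
        self.right_child = None

    def insert_left(self, value):
        if self.left_child is None:
            self.left_child = BinaryTree(value)
        else:
            # Splice the new node between this node and its current left child.
            new_node = BinaryTree(value)
            new_node.left_child = self.left_child
            self.left_child = new_node

root = BinaryTree("a")
root.insert_left("b")   # a -> b
root.insert_left("x")   # "x" is spliced in between: a -> x -> b
print(root.left_child.value)             # x
print(root.left_child.left_child.value)  # b
```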
            

Build Binary Tree

a --> b --> d

a --> c --> e

a --> c --> f

a_node = BinaryTree("a")
a_node.insert_left("b")
a_node.insert_right("c")
b_node = a_node.left_child
c_node = a_node.right_child
b_node.insert_right('d')
c_node.insert_left('e')
c_node.insert_right('f')
d_node = b_node.right_child
e_node = c_node.left_child
f_node = c_node.right_child

print(a_node.value) # a
print(b_node.value) # b
print(c_node.value) # c
print(d_node.value) # d
print(e_node.value) # e
print(f_node.value) # f
a
b
c
d
e
f

Tree Traversal

DFS (Depth-First Search)

        1
      /   \
     2     5
    / \   / \
   3   4 6   7
Pre-order (visit order: 1 2 3 4 5 6 7)

def pre_order(self):
    print(self.value)

    if self.left_child:
        self.left_child.pre_order()

    if self.right_child:
        self.right_child.pre_order()
In-order (visit order: 3 2 4 1 6 5 7)

def in_order(self):
    if self.left_child:
        self.left_child.in_order()

    print(self.value)

    if self.right_child:
        self.right_child.in_order()
Post-order (visit order: 3 4 2 6 7 5 1)

def post_order(self):
    if self.left_child:
        self.left_child.post_order()
    if self.right_child:
        self.right_child.post_order()
    print(self.value)
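The three visit orders can be verified with a small self-contained sketch that collects values into lists instead of printing (a standalone `Node` class is used here so the snippet runs by itself):

```python
# Verify the three DFS orders on the 7-node tree above.
class Node:
    def __init__(self, value, left=None, right=None):
        self.value = value
        self.left = left
        self.right = right

def pre_order(n, out):
    if n:
        out.append(n.value)          # root first
        pre_order(n.left, out)
        pre_order(n.right, out)

def in_order(n, out):
    if n:
        in_order(n.left, out)
        out.append(n.value)          # root in the middle
        in_order(n.right, out)

def post_order(n, out):
    if n:
        post_order(n.left, out)
        post_order(n.right, out)
        out.append(n.value)          # root last

root = Node(1, Node(2, Node(3), Node(4)), Node(5, Node(6), Node(7)))

pre, ino, post = [], [], []
pre_order(root, pre)
in_order(root, ino)
post_order(root, post)
print(pre)   # [1, 2, 3, 4, 5, 6, 7]
print(ino)   # [3, 2, 4, 1, 6, 5, 7]
print(post)  # [3, 4, 2, 6, 7, 5, 1]
```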

BFS (Breadth-First Search)

from queue import Queue

def bfs(self):
    queue = Queue()
    queue.put(self)
    while not queue.empty():
        node = queue.get()
        print(node.value)
        if node.left_child:
            queue.put(node.left_child)
        if node.right_child:
            queue.put(node.right_child)
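On the same 7-node tree, BFS visits level by level. A self-contained sketch (using `collections.deque`, which works just as well as `queue.Queue` for a single-threaded traversal):

```python
# Level-order (BFS) traversal on the 7-node tree from the DFS section.
from collections import deque

class Node:
    def __init__(self, value, left=None, right=None):
        self.value, self.left, self.right = value, left, right

def bfs(root):
    out, queue = [], deque([root])
    while queue:
        node = queue.popleft()       # FIFO: oldest discovered node first
        out.append(node.value)
        if node.left:
            queue.append(node.left)
        if node.right:
            queue.append(node.right)
    return out

root = Node(1, Node(2, Node(3), Node(4)), Node(5, Node(6), Node(7)))
print(bfs(root))  # [1, 2, 5, 3, 4, 6, 7]
```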
    

Implementing the Decision Tree

Task

Implement ID3 algorithm and test it with watermelon dataset.

Dataset

def createDataSet():
    """
    创建测试的数据集,里面的数值中具有连续值
    :return:
    """
    dataSet = [
        # 1
        ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.697, 0.460, '好瓜'],
        # 2
        ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', 0.774, 0.376, '好瓜'],
        # 3
        ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.634, 0.264, '好瓜'],
        # 4
        ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', 0.608, 0.318, '好瓜'],
        # 5
        ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.556, 0.215, '好瓜'],
        # 6
        ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', 0.403, 0.237, '好瓜'],
        # 7
        ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', 0.481, 0.149, '好瓜'],
        # 8
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', 0.437, 0.211, '好瓜'],

        # ----------------------------------------------------
        # 9
        ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', 0.666, 0.091, '坏瓜'],
        # 10
        ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', 0.243, 0.267, '坏瓜'],
        # 11
        ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', 0.245, 0.057, '坏瓜'],
        # 12
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', 0.343, 0.099, '坏瓜'],
        # 13
        ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', 0.639, 0.161, '坏瓜'],
        # 14
        ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', 0.657, 0.198, '坏瓜'],
        # 15
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', 0.360, 0.370, '坏瓜'],
        # 16
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', 0.593, 0.042, '坏瓜'],
        # 17
        ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', 0.719, 0.103, '坏瓜']
    ]
    return dataSet
dataSet = createDataSet()
import numpy as np
import pandas as pds
data_in_frame = pds.DataFrame(data=dataSet, columns=["色泽","根蒂","敲声","纹理","脐部","触感","-","-","好瓜"])
column_name = data_in_frame.columns
column_name = column_name[column_name != "-"]
data_in_frame = data_in_frame.loc[:, column_name]
data_in_frame
    色泽  根蒂  敲声  纹理  脐部  触感  好瓜
0   青绿  蜷缩  浊响  清晰  凹陷  硬滑  好瓜
1   乌黑  蜷缩  沉闷  清晰  凹陷  硬滑  好瓜
2   乌黑  蜷缩  浊响  清晰  凹陷  硬滑  好瓜
3   青绿  蜷缩  沉闷  清晰  凹陷  硬滑  好瓜
4   浅白  蜷缩  浊响  清晰  凹陷  硬滑  好瓜
5   青绿  稍蜷  浊响  清晰  稍凹  软粘  好瓜
6   乌黑  稍蜷  浊响  稍糊  稍凹  软粘  好瓜
7   乌黑  稍蜷  浊响  清晰  稍凹  硬滑  好瓜
8   乌黑  稍蜷  沉闷  稍糊  稍凹  硬滑  坏瓜
9   青绿  硬挺  清脆  清晰  平坦  软粘  坏瓜
10  浅白  硬挺  清脆  模糊  平坦  硬滑  坏瓜
11  浅白  蜷缩  浊响  模糊  平坦  软粘  坏瓜
12  青绿  稍蜷  浊响  稍糊  凹陷  硬滑  坏瓜
13  浅白  稍蜷  沉闷  稍糊  凹陷  硬滑  坏瓜
14  乌黑  稍蜷  浊响  清晰  稍凹  软粘  坏瓜
15  浅白  蜷缩  浊响  模糊  平坦  硬滑  坏瓜
16  青绿  蜷缩  沉闷  稍糊  稍凹  硬滑  坏瓜

Entropy

def entropy(y):
    """
    y: DataFrame of shape sample x (features + label);
       the label is the last column
    """
    label = y.iloc[:, -1]
    counts = label.value_counts()

    ent = 0
    for count in counts:
        p = count / len(label)
        ent += -p * np.log2(p)
    return ent
entropy(data_in_frame)
0.9975025463691153
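A quick hand check of that number: 8 of the 17 melons are 好瓜 and 9 are 坏瓜, so the entropy should be −(8/17)·log₂(8/17) − (9/17)·log₂(9/17):

```python
# Hand check of the dataset entropy: 8 positive, 9 negative out of 17.
import math

p_good, p_bad = 8 / 17, 9 / 17
ent = -p_good * math.log2(p_good) - p_bad * math.log2(p_bad)
print(ent)  # ~0.9975
```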

Info Gain

def info_gain(y, split_feature):
    """
    y: DataFrame of shape sample x (features + label)
    split_feature: name of the feature to split on
    """
    old_ent = entropy(y)
    label = y.iloc[:,-1]
    target_data = y[split_feature]
    new_data = pds.concat([target_data, label],axis=1)
    type_of_new_data = list(set(new_data.iloc[:,0]))
    num_of_type_of_new_data = len(type_of_new_data)
    size_of_new_data = len(new_data)
    
    ent = 0
    for i in range(num_of_type_of_new_data):
        data_current = new_data[new_data[split_feature] == type_of_new_data[i]]
        size_of_data_current = len(data_current)
        p_of_data_current = size_of_data_current / size_of_new_data
        ent_current = p_of_data_current*entropy(data_current)
        ent += ent_current
    gain = old_ent - ent
    return gain
info_gain(data_in_frame, "色泽")
0.10812516526536531

Best selection

def find_winner(y):
    IG = []
    for key in y.keys()[:-1]:
        IG.append(info_gain(y, key))
    best_key = y.keys()[np.argmax(IG)]
    return best_key
find_winner(data_in_frame)
'纹理'

Get subtable

def get_subtable(df, split_feature, feature_value):
    return df[df[split_feature]==feature_value].reset_index(drop=True)

ID3 Tree

A wrong example

This version is subtly wrong: it derives each attribute's value set from the current data subset, so the feature space shrinks as the recursion descends. A value that happens to be absent from a subset never gets a branch in the resulting tree.

def id3(data):
    Class = data.keys()[-1]    #label
    node = find_winner(data)
    attvalue = np.unique(data[node])
    tree = {}
    tree[node] = {}
    for value in attvalue:
        subtable = get_subtable(data, node,value)
        clvalue, counts = np.unique(subtable[Class], return_counts=True)
        
        if len(clvalue) == 1:
            tree[node][value] = clvalue[0]
        else:
            tree[node][value] = id3(subtable)
    return tree
tree = id3(data_in_frame)
import pprint
pprint.pprint(tree)
{'纹理': {'模糊': '坏瓜',
        '清晰': {'根蒂': {'硬挺': '坏瓜',
                      '稍蜷': {'色泽': {'乌黑': {'触感': {'硬滑': '好瓜', '软粘': '坏瓜'}},
                                    '青绿': '好瓜'}},
                      '蜷缩': '好瓜'}},
        '稍糊': {'触感': {'硬滑': '坏瓜', '软粘': '好瓜'}}}}

A right example

def buildtree(data):
    #store the feature space first
    attvalue_table = {}
    for key in data.keys()[:-1]:
        attvalue_table[key] = np.unique(data[key])
    #print(attvalue_table)
    def id3(data):
        Class = data.keys()[-1]    #label
        node = find_winner(data)
        #print(node)
        attvalue = attvalue_table[node]
        #print(attvalue)
        tree = {}
        tree[node] = {}
        for value in attvalue:
            subtable = get_subtable(data, node,value)
            clvalue, counts = np.unique(subtable[Class], return_counts=True)

            if len(clvalue) == 1:
                tree[node][value] = clvalue[0]
            elif len(clvalue) == 0:
                clvalue, counts = np.unique(data[Class], return_counts=True)
                tree[node][value] = clvalue[np.argmax(counts)]
            else:
                tree[node][value] = id3(subtable)
        return tree
    tree = id3(data)
    return tree
tree = buildtree(data_in_frame)
import pprint
pprint.pprint(tree)
{'纹理': {'模糊': '坏瓜',
        '清晰': {'根蒂': {'硬挺': '坏瓜',
                      '稍蜷': {'色泽': {'乌黑': {'触感': {'硬滑': '好瓜', '软粘': '坏瓜'}},
                                    '浅白': '好瓜',
                                    '青绿': '好瓜'}},
                      '蜷缩': '好瓜'}},
        '稍糊': {'触感': {'硬滑': '坏瓜', '软粘': '好瓜'}}}}

You can see that the value 浅白 of the feature 色泽 is now handled as well (it is assigned the majority class of the parent node's data).

Predict

def predict_inst(inst, tree):
    # A leaf node stores the class label directly.
    if type(tree) is not dict:
        return tree
    # An internal node has the form {feature: {value: subtree}};
    # follow the branch matching the instance's value for that feature.
    key = list(tree.keys())[0]
    value = inst[key]
    return predict_inst(inst, tree[key][value])

def predict(data, tree):
    """
    data: sample * features
    """
    prediction = []
    for i in range(len(data)):
        x = data.iloc[i,:]
        prediction.append(predict_inst(x, tree))
    return prediction
# Here the training data doubles as the test data,
# just to verify the prediction code (accuracy should be 100%)
test_data = data_in_frame.iloc[:,:-1]
label_of_test_data = data_in_frame.iloc[:,-1]
label_predicted = predict(test_data, tree)

Accuracy = np.sum(label_of_test_data == label_predicted)/len(label_predicted)
print("{:.2%}".format(Accuracy))
100.00%

Pruning, Continuous Values, and Missing Values

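For the continuous case, the standard (C4.5-style) approach is bi-partition: candidate thresholds are the midpoints of adjacent sorted values, and the threshold with the largest information gain wins. A hedged sketch, where `find_best_threshold` is an illustrative helper rather than part of the implementation above:

```python
# Sketch of bi-partition for a continuous attribute (C4.5 style):
# candidate thresholds are midpoints of adjacent sorted values;
# pick the one with the highest information gain.
import numpy as np

def _entropy(labels):
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -(p * np.log2(p)).sum()

def find_best_threshold(values, labels):
    values = np.asarray(values, dtype=float)
    labels = np.asarray(labels)
    order = np.argsort(values)
    values, labels = values[order], labels[order]
    base = _entropy(labels)
    best_gain, best_t = -1.0, None
    for t in (values[:-1] + values[1:]) / 2:          # midpoints
        left, right = labels[values <= t], labels[values > t]
        w = len(left) / len(labels)
        gain = base - w * _entropy(left) - (1 - w) * _entropy(right)
        if gain > best_gain:
            best_gain, best_t = gain, t
    return best_t, best_gain

# Toy example: a clean split between the two classes sits at 5.0.
t, g = find_best_threshold([1, 2, 8, 9], [0, 0, 1, 1])
print(t, g)  # 5.0 1.0
```

The same entropy machinery as above applies; only the candidate-split enumeration changes, and unlike a discrete attribute, a continuous attribute may be reused deeper in the tree with a different threshold.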
