Introduction
Write a program that implements a decision-tree algorithm whose split selection is based on cross-entropy (i.e., the information entropy of the class distribution), and use it to generate a decision tree for the data in Table 4.3.
Dataset
编号,色泽,根蒂,敲声,纹理,脐部,触感,密度,含糖率,好瓜
1,青绿,蜷缩,浊响,清晰,凹陷,硬滑,0.697,0.46,是
2,乌黑,蜷缩,沉闷,清晰,凹陷,硬滑,0.774,0.376,是
3,乌黑,蜷缩,浊响,清晰,凹陷,硬滑,0.634,0.264,是
4,青绿,蜷缩,沉闷,清晰,凹陷,硬滑,0.608,0.318,是
5,浅白,蜷缩,浊响,清晰,凹陷,硬滑,0.556,0.215,是
6,青绿,稍蜷,浊响,清晰,稍凹,软粘,0.403,0.237,是
7,乌黑,稍蜷,浊响,稍糊,稍凹,软粘,0.481,0.149,是
8,乌黑,稍蜷,浊响,清晰,稍凹,硬滑,0.437,0.211,是
9,乌黑,稍蜷,沉闷,稍糊,稍凹,硬滑,0.666,0.091,否
10,青绿,硬挺,清脆,清晰,平坦,软粘,0.243,0.267,否
11,浅白,硬挺,清脆,模糊,平坦,硬滑,0.245,0.057,否
12,浅白,蜷缩,浊响,模糊,平坦,软粘,0.343,0.099,否
13,青绿,稍蜷,浊响,稍糊,凹陷,硬滑,0.639,0.161,否
14,浅白,稍蜷,沉闷,稍糊,凹陷,硬滑,0.657,0.198,否
15,乌黑,稍蜷,浊响,清晰,稍凹,软粘,0.36,0.37,否
16,浅白,蜷缩,浊响,模糊,平坦,硬滑,0.593,0.042,否
17,青绿,蜷缩,沉闷,稍糊,稍凹,硬滑,0.719,0.103,否
Code Implementation
The code consists of two main parts: one plots the decision tree, and the other generates the decision-tree data (the nested dictionary). Personally, I think the hard part of this exercise is plotting the tree.
1. Plotting the decision tree
The plotting code is adapted from [another author](https://blog.csdn.net/weixin_37922777/article/details/88821957); since that version could only draw binary trees, I modified it to handle an arbitrary number of children.
The tree is built up as nested dictionaries, so the core algorithm later **stores each node as a dictionary**.<br>
The format is roughly: {attribute: {'value 1': subtree or leaf 1, 'value 2': subtree or leaf 2}}
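For example, a small hand-written tree in this format (using a branch of the final result as an illustration) looks like:

# Hand-built example of the nested-dict format: the key is the splitting attribute,
# each sub-key is one of its values, and each sub-value is a subtree (dict) or a leaf label (string).
example_tree = {'纹理': {'清晰': '好瓜',
                         '稍糊': {'触感': {'硬滑': '坏瓜', '软粘': '好瓜'}},
                         '模糊': '坏瓜'}}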
import matplotlib.pyplot as plt
import matplotlib as mpl
# Define the box and arrow styles for the node annotations
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")
mpl.rcParams['font.sans-serif'] = ['SimHei']  # default font that can render the Chinese labels
mpl.rcParams['axes.unicode_minus'] = False    # avoid minus signs being rendered as boxes
def plotMidText(cntrPt, parentPt, txtString):  # write the edge label (attribute value) midway between parent and child
xMid = (parentPt[0]-cntrPt[0])/2.0 + cntrPt[0]
yMid = (parentPt[1]-cntrPt[1])/2.0 + cntrPt[1]
createPlot.ax1.text(xMid, yMid, txtString)
def plotNode(nodeTxt, centerPt, parentPt, nodeType):  # draw a node with an arrow pointing to it from its parent
createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords="axes fraction", xytext=centerPt, textcoords="axes fraction", va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)
def getNumLeafs(myTree):  # count the number of leaf nodes
numLeafs = 0
firstStr = list(myTree.keys())[0]
secondDict = myTree[firstStr]
if type(secondDict).__name__ == 'dict':
for key in secondDict.keys():
if type(secondDict[key]).__name__ == 'dict':
numLeafs += getNumLeafs(secondDict[key])
else: numLeafs += 1
else:
numLeafs += 1
return numLeafs
def getTreeDepth(myTree):  # compute the number of levels of the tree
maxDepth = 0
firstStr = list(myTree.keys())[0]
secondDict = myTree[firstStr]
if type(secondDict).__name__ == 'dict':
for key in secondDict.keys():
if type(secondDict[key]).__name__ == 'dict':
thisDepth = 1 + getTreeDepth(secondDict[key])
else:
thisDepth = 1
if thisDepth > maxDepth:
maxDepth = thisDepth
else:
maxDepth = 1
return maxDepth
def retrieveTree(i):  # return a predefined sample tree (handy for testing the plotting code)
listOfTrees =[{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
{'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}}
]
return listOfTrees[i]
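The two helper functions above can be sanity-checked on the first predefined sample tree:

# Quick check of the helpers on the predefined sample tree
print(getNumLeafs(retrieveTree(0)))   # 3 leaves: 'no', 'no', 'yes'
print(getTreeDepth(retrieveTree(0)))  # depth 2: 'no surfacing' -> 'flippers'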
def plotTree(myTree, parentPt, nodeTxt):  # recursively plot the subtree rooted at myTree
numLeafs = getNumLeafs(myTree)
getTreeDepth(myTree)
firstStr = list(myTree.keys())[0]
cntrPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW,\
plotTree.yOff)
plotMidText(cntrPt, parentPt, nodeTxt)
plotNode(firstStr, cntrPt, parentPt, decisionNode)
secondDict = myTree[firstStr]
plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD
if type(secondDict).__name__=='dict':
for key in secondDict.keys():
if type(secondDict[key]).__name__=='dict':
plotTree(secondDict[key],cntrPt,str(key))
else:
plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff),
cntrPt, leafNode)
plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
else:
plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
plotNode(secondDict, (plotTree.xOff, plotTree.yOff),cntrPt, leafNode)
plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(secondDict))
plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD
def createPlot(inTree):  # set up the figure and draw the whole tree
fig = plt.figure(1, figsize=(12, 8), facecolor='white')
fig.clf()
axprops = dict(xticks=[], yticks=[])
createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
plotTree.totalW = float(getNumLeafs(inTree))
plotTree.totalD = float(getTreeDepth(inTree))
plotTree.xOff = -0.5/plotTree.totalW; plotTree.yOff = 1.0;
plotTree(inTree, (0.5,1.0), '')
plt.show()
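Before wiring the plotting code up to the tree-generation code, it can be tested on a small hand-built tree (the same sample dictionary used as an illustration above):

# Draw a small hand-built tree to verify the plotting code.
sample = {'纹理': {'清晰': '好瓜',
                   '稍糊': {'触感': {'硬滑': '坏瓜', '软粘': '好瓜'}},
                   '模糊': '坏瓜'}}
createPlot(sample)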
2. Generating the decision tree
There is not much to say about this part: just follow the pseudocode in the book. The main thing that takes some thought is the few places where nodes are created, i.e. how to build up the nested dictionary.
import pandas as pd
import numpy as np
# Simple preprocessing: convert the label column to numeric form
# and drop the first column (the sample id 编号)
def data_preprocess(data):
    data.loc[data['好瓜']=='是', '好瓜'] = 1
    data.loc[data['好瓜']=='否', '好瓜'] = 0
    data = data.iloc[:, 1:]
    return data
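The functions below rely on a few module-level names that the post never shows being set up: `data` (the preprocessed data set), `label` (the name of the label column) and `categories` (the class values). A minimal setup sketch, assuming the table above has been saved as a CSV file with the hypothetical name watermelon3.csv:

# Assumed setup (not shown in the original post): load the table and define the globals
# that is_same_category / find_type_of_D / compute_Cross_Entropy / TreeGenerate rely on.
data = pd.read_csv('watermelon3.csv')    # hypothetical file containing the table above
data = data_preprocess(data)
label = '好瓜'                            # name of the label column
categories = data[label].unique()         # class values after preprocessing: [1, 0]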
# Check whether all samples in the current data set belong to the same class
# (`label` is the module-level name of the label column, which is the last column)
def is_same_category(D):
    num = D.loc[:, label].nunique()
    if num == 1:
        flag = True
    else:
        flag = False
    return flag, D.iloc[0, -1]
# Check whether the current attribute set is empty
def is_attributes_empty(attribute):
    num = attribute.nunique()
    if num == 0:
        return True
    else:
        return False
# Check whether all samples in D take the same value on every attribute in A
def is_same_in_A(D, A):
    num = D.loc[:, A].nunique()   # number of distinct values per attribute
    num = np.sum(num)
    numA = len(A)
    if num == numA:               # every attribute has exactly one distinct value
        return True
    else:
        return False
# Find the majority class in D and return the corresponding leaf label
def find_type_of_D(D):
    good = D.loc[D[label]==1, :].shape[0]
    bad = D.loc[D[label]==0, :].shape[0]
    if good >= bad:
        return '好瓜'
    else:
        return '坏瓜'
# Return the subset of D whose value on attribute A equals a
def get_D_v_from_D(D, A, a):
    Dv = D.loc[D[A]==a, :]
    return Dv
# Compute the entropy of the class distribution in data set D
# (`categories` is the module-level list of class values)
def compute_Cross_Entropy(D):
    num = D.shape[0]
    curEntropy = 0.
    for c in categories:
        c_num = D.loc[D[label]==c, :].shape[0]
        if c_num == 0:
            tmp = 0               # convention: 0 * log2(0) = 0
        else:
            pc = c_num / num
            tmp = -pc * np.log2(pc)
        curEntropy += tmp
    return curEntropy
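What this function computes is the information entropy of the class distribution in D:

$$\mathrm{Ent}(D) = -\sum_{k=1}^{|\mathcal{Y}|} p_k \log_2 p_k$$

where $p_k$ is the proportion of samples in $D$ belonging to the $k$-th class, and the `c_num == 0` branch implements the convention $0\log_2 0 = 0$.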
# Pick the attribute in A that gives the largest entropy reduction (information gain)
def find_best_a_in_A(D, A):
    curEntropy = compute_Cross_Entropy(D)
    improve = 0
    selectedA = ''
    for a in A:
        entropy = 0               # weighted entropy of D after splitting on a
        D_num = D.shape[0]
        for av in pd.unique(D.loc[:, a]):
            Dv_a = get_D_v_from_D(D, a, av)
            a_num = Dv_a.shape[0]
            entropy += a_num * compute_Cross_Entropy(Dv_a) / D_num
        if curEntropy - entropy > improve:
            improve = curEntropy - entropy
            selectedA = a
    return selectedA
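The quantity being maximized here is the information gain of attribute $a$:

$$\mathrm{Gain}(D, a) = \mathrm{Ent}(D) - \sum_{v=1}^{V} \frac{|D^v|}{|D|}\,\mathrm{Ent}(D^v)$$

where $D^v$ is the subset of $D$ that takes the $v$-th value of $a$; the function returns the attribute with the largest gain.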
# Build the tree.
# Unlike the pseudocode in the book, in addition to the data set D and the attribute set A
# I also pass in the parent dictionary (lastNode) and the attribute value that leads to this
# node (lastA), so that when the current node turns out to be a leaf (all samples share one
# class, the attribute set is empty, etc.) the leaf can be written directly into the parent.
def TreeGenerate(D, A, root, lastNode, lastA):
    flag, category = is_same_category(D)
    if flag:
        # all samples in D belong to the same class: make this node a leaf
        lastNode[lastA] = '好瓜' if category==1 else '坏瓜'
        return
    if is_attributes_empty(A) or is_same_in_A(D, A):
        # no attribute left to split on (or all samples identical on A): majority-class leaf
        lastNode[lastA] = find_type_of_D(D)
        return
    best_a = find_best_a_in_A(D, A)
    root[best_a] = {}
    # Iterate over every value of best_a in the *full* data set `data`, not just in D,
    # so that values missing from D still get a leaf labelled with the majority class of D.
    for av in pd.unique(data.loc[:, best_a]):
        Dv = get_D_v_from_D(D, best_a, av)
        if Dv.shape[0] == 0:
            root[best_a][av] = find_type_of_D(D)
        else:
            A_ = A.drop(best_a)
            root[best_a][av] = {}
            TreeGenerate(Dv, A_, root[best_a][av], root[best_a], av)
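The original post does not show the driver code, so here is a minimal sketch of how the pieces might be wired together. My assumptions: `data`, `label` and `categories` are set up as in the earlier snippet, the two continuous columns 密度 and 含糖率 are excluded because continuous values are not handled yet, and 'root' is just a dummy key for the very first call.

# Minimal driver sketch (assumptions noted above).
A = data.columns.drop([label, '密度', '含糖率'])   # candidate (categorical) splitting attributes
tree = {}
TreeGenerate(data, A, tree, tree, 'root')
print(tree)        # nested-dict tree, as shown in the Results section below
createPlot(tree)   # draw it with the plotting code from part 1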
Results
- The final generated dictionary is:
{'纹理': {'清晰': {'根蒂': {'蜷缩': '好瓜',
                            '稍蜷': {'色泽': {'青绿': '好瓜',
                                              '乌黑': {'触感': {'硬滑': '好瓜', '软粘': '坏瓜'}},
                                              '浅白': '好瓜'}},
                            '硬挺': '坏瓜'}},
          '稍糊': {'触感': {'硬滑': '坏瓜', '软粘': '好瓜'}},
          '模糊': '坏瓜'}}
Due to lack of time, continuous-valued attributes (density and sugar content) are not handled yet; I will come back and implement that in the next few days.