Introduction
Write a program that implements a decision-tree algorithm whose split selection is based on cross-entropy (i.e., the information entropy of the class distribution), and use it to generate a decision tree for the data in Table 4.3.
Dataset
编号,色泽,根蒂,敲声,纹理,脐部,触感,密度,含糖率,好瓜
1,青绿,蜷缩,浊响,清晰,凹陷,硬滑,0.697,0.46,是
2,乌黑,蜷缩,沉闷,清晰,凹陷,硬滑,0.774,0.376,是
3,乌黑,蜷缩,浊响,清晰,凹陷,硬滑,0.634,0.264,是
4,青绿,蜷缩,沉闷,清晰,凹陷,硬滑,0.608,0.318,是
5,浅白,蜷缩,浊响,清晰,凹陷,硬滑,0.556,0.215,是
6,青绿,稍蜷,浊响,清晰,稍凹,软粘,0.403,0.237,是
7,乌黑,稍蜷,浊响,稍糊,稍凹,软粘,0.481,0.149,是
8,乌黑,稍蜷,浊响,清晰,稍凹,硬滑,0.437,0.211,是
9,乌黑,稍蜷,沉闷,稍糊,稍凹,硬滑,0.666,0.091,否
10,青绿,硬挺,清脆,清晰,平坦,软粘,0.243,0.267,否
11,浅白,硬挺,清脆,模糊,平坦,硬滑,0.245,0.057,否
12,浅白,蜷缩,浊响,模糊,平坦,软粘,0.343,0.099,否
13,青绿,稍蜷,浊响,稍糊,凹陷,硬滑,0.639,0.161,否
14,浅白,稍蜷,沉闷,稍糊,凹陷,硬滑,0.657,0.198,否
15,乌黑,稍蜷,浊响,清晰,稍凹,软粘,0.36,0.37,否
16,浅白,蜷缩,浊响,模糊,平坦,硬滑,0.593,0.042,否
17,青绿,蜷缩,沉闷,稍糊,稍凹,硬滑,0.719,0.103,否
Code Implementation
The code consists of two main parts: one plots the decision tree, and the other generates the decision-tree data (the nested dictionary). Personally, I think the hard part of this exercise is plotting the tree.
1. Plotting the decision tree
The plotting code is adapted from [another author](https://blog.csdn.net/weixin_37922777/article/details/88821957); since that version could only draw binary trees, I modified it to handle an arbitrary number of children.
The tree is built up as nested dictionaries, so the core algorithm later **stores each node as a dictionary**.<br>
The format is roughly: {attribute: {'value 1': subtree or leaf 1, 'value 2': subtree or leaf 2}}
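For example, a small hand-written tree in this format (using a branch of the final result as an illustration) looks like:

# Hand-built example of the nested-dict format: the key is the splitting attribute,
# each sub-key is one of its values, and each sub-value is a subtree (dict) or a leaf label (string).
example_tree = {'纹理': {'清晰': '好瓜',
                         '稍糊': {'触感': {'硬滑': '坏瓜', '软粘': '好瓜'}},
                         '模糊': '坏瓜'}}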
import matplotlib.pyplot as plt
import matplotlib as mpl
# Define the box and arrow styles for the node annotations
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")
mpl.rcParams['font.sans-serif'] = ['SimHei']  # default font that can render the Chinese labels
mpl.rcParams['axes.unicode_minus'] = False    # avoid minus signs being rendered as boxes
def plotMidText(cntrPt, parentPt, txtString):  # write the edge label (attribute value) midway between parent and child
xMid = (parentPt[0]-cntrPt[0])/2.0 + cntrPt[0]
yMid = (parentPt[1]-cntrPt[1])/2.0 + cntrPt[1]
createPlot.ax1.text(xMid, yMid, txtString)
def plotNode(nodeTxt, centerPt, parentPt, nodeType):  # draw a node with an arrow pointing to it from its parent
createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords="axes fraction", xytext=centerPt, textcoords="axes fraction", va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)
def getNumLeafs(myTree):  # count the number of leaf nodes
numLeafs = 0
firstStr = list(myTree.keys())[0]
secondDict = myTree[firstStr]
if type(secondDict).__name__ == 'dict':
for key in secondDict.keys():
if type(secondDict[key]).__name__ == 'dict':
numLeafs += getNumLeafs(secondDict[key])
else: numLeafs += 1
else:
numLeafs += 1
return numLeafs
def getTreeDepth(myTree):  # compute the number of levels of the tree
maxDepth = 0
firstStr = list(myTree.keys())[0]
secondDict = myTree[firstStr]
if type(secondDict).__name__ == 'dict':
for key in secondDict.keys():
if type(secondDict[key]).__name__ == 'dict':
thisDepth = 1 + getTreeDepth(secondDict[key])
else:
thisDepth = 1
if thisDepth > maxDepth:
maxDepth = thisDepth
else:
maxDepth = 1
return maxDepth
def retrieveTree(i):  # return a predefined sample tree (handy for testing the plotting code)
listOfTrees =[{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
{'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}}
]
return listOfTrees[i]
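The two helper functions above can be sanity-checked on the first predefined sample tree:

# Quick check of the helpers on the predefined sample tree
print(getNumLeafs(retrieveTree(0)))   # 3 leaves: 'no', 'no', 'yes'
print(getTreeDepth(retrieveTree(0)))  # depth 2: 'no surfacing' -> 'flippers'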
def plotTree(myTree, parentPt, nodeTxt):  # recursively plot the subtree rooted at myTree
numLeafs = getNumLeafs(myTree)
getTreeDepth(myTree)
firstStr = list(myTree.keys())[0]
cntrPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW,\
plotTree.yOff)
plotMidText(cntrPt, parentPt, nodeTxt)
plotNode(firstStr, cntrPt, parentPt, decisionNode)
secondDict = myTree[firstStr]
plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD
if type(secondDict).__name__=='dict':
for key in secondDict.keys():
if type(secondDict[key]).__name__=='dict':
plotTree(secondDict[key],cntrPt,str(key))
else:
plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff),
cntrPt, leafNode)
plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
else:
plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
plotNode(secondDict, (plotTree.xOff, plotTree.yOff),cntrPt, leafNode)
plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(secondDict))
plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD
def createPlot(inTree):  # set up the figure and draw the whole tree
fig = plt.figure(1, figsize=(12, 8), facecolor='white')
fig.clf()
axprops = dict(xticks=[], yticks=[])
createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
plotTree.totalW = float(getNumLeafs(inTree))
plotTree.totalD = float(getTreeDepth(inTree))
plotTree.xOff = -0.5/plotTree.totalW; plotTree.yOff = 1.0;
plotTree(inTree, (0.5,1.0), '')
plt.show()
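Before wiring the plotting code up to the tree-generation code, it can be tested on a small hand-built tree (the same sample dictionary used as an illustration above):

# Draw a small hand-built tree to verify the plotting code.
sample = {'纹理': {'清晰': '好瓜',
                   '稍糊': {'触感': {'硬滑': '坏瓜', '软粘': '好瓜'}},
                   '模糊': '坏瓜'}}
createPlot(sample)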
2. Generating the decision tree
There is not much to say about this part: just follow the pseudocode in the book. The main thing that takes some thought is the few places where nodes are created, i.e. how to build up the nested dictionary.
import pandas as pd
import numpy as np
# Simple preprocessing: convert the label column to numeric form
# and drop the first column (the sample id 编号)
def data_preprocess(data):
    data.loc[data['好瓜']=='是', '好瓜'] = 1
    data.loc[data['好瓜']=='否', '好瓜'] = 0
    data = data.iloc[:, 1:]
    return data
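The functions below rely on a few module-level names that the post never shows being set up: `data` (the preprocessed data set), `label` (the name of the label column) and `categories` (the class values). A minimal setup sketch, assuming the table above has been saved as a CSV file with the hypothetical name watermelon3.csv:

# Assumed setup (not shown in the original post): load the table and define the globals
# that is_same_category / find_type_of_D / compute_Cross_Entropy / TreeGenerate rely on.
data = pd.read_csv('watermelon3.csv')    # hypothetical file containing the table above
data = data_preprocess(data)
label = '好瓜'                            # name of the label column
categories = data[label].unique()         # class values after preprocessing: [1, 0]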
# Check whether all samples in the current data set belong to the same class
# (`label` is the module-level name of the label column, which is the last column)
def is_same_category(D):
    num = D.loc[:, label].nunique()
    if num == 1:
        flag = True
    else:
        flag = False
    return flag, D.iloc[0, -1]
# Check whether the current attribute set is empty
def is_attributes_empty(attribute):
    num = attribute.nunique()
    if num == 0:
        return True
    else:
        return False
# Check whether all samples in D take the same value on every attribute in A
def is_same_in_A(D, A):
    num = D.loc[:, A].nunique()   # number of distinct values per attribute
    num = np.sum(num)
    numA = len(A)
    if num == numA:               # every attribute has exactly one distinct value
        return True
    else:
        return False
# Find the majority class in D and return the corresponding leaf label
def find_type_of_D(D):
    good = D.loc[D[label]==1, :].shape[0]
    bad = D.loc[D[label]==0, :].shape[0]
    if good >= bad:
        return '好瓜'
    else:
        return '坏瓜'
# Return the subset of D whose value on attribute A equals a
def get_D_v_from_D(D, A, a):
    Dv = D.loc[D[A]==a, :]
    return Dv
# Compute the entropy of the class distribution in data set D
# (`categories` is the module-level list of class values)
def compute_Cross_Entropy(D):
    num = D.shape[0]
    curEntropy = 0.
    for c in categories:
        c_num = D.loc[D[label]==c, :].shape[0]
        if c_num == 0:
            tmp = 0               # convention: 0 * log2(0) = 0
        else:
            pc = c_num / num
            tmp = -pc * np.log2(pc)
        curEntropy += tmp
    return curEntropy
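What this function computes is the information entropy of the class distribution in D:

$$\mathrm{Ent}(D) = -\sum_{k=1}^{|\mathcal{Y}|} p_k \log_2 p_k$$

where $p_k$ is the proportion of samples in $D$ belonging to the $k$-th class, and the `c_num == 0` branch implements the convention $0\log_2 0 = 0$.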
# Pick the attribute in A that gives the largest entropy reduction (information gain)
def find_best_a_in_A(D, A):
    curEntropy = compute_Cross_Entropy(D)
    improve = 0
    selectedA = ''
    for a in A:
        entropy = 0               # weighted entropy of D after splitting on a
        D_num = D.shape[0]
        for av in pd.unique(D.loc[:, a]):
            Dv_a = get_D_v_from_D(D, a, av)
            a_num = Dv_a.shape[0]
            entropy += a_num * compute_Cross_Entropy(Dv_a) / D_num
        if curEntropy - entropy > improve:
            improve = curEntropy - entropy
            selectedA = a
    return selectedA
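The quantity being maximized here is the information gain of attribute $a$:

$$\mathrm{Gain}(D, a) = \mathrm{Ent}(D) - \sum_{v=1}^{V} \frac{|D^v|}{|D|}\,\mathrm{Ent}(D^v)$$

where $D^v$ is the subset of $D$ that takes the $v$-th value of $a$; the function returns the attribute with the largest gain.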
# Build the tree.
# Unlike the pseudocode in the book, in addition to the data set D and the attribute set A
# I also pass in the parent dictionary (lastNode) and the attribute value that leads to this
# node (lastA), so that when the current node turns out to be a leaf (all samples share one
# class, the attribute set is empty, etc.) the leaf can be written directly into the parent.
def TreeGenerate(D, A, root, lastNode, lastA):
    flag, category = is_same_category(D)
    if flag:
        # all samples in D belong to the same class: make this node a leaf
        lastNode[lastA] = '好瓜' if category==1 else '坏瓜'
        return
    if is_attributes_empty(A) or is_same_in_A(D, A):
        # no attribute left to split on (or all samples identical on A): majority-class leaf
        lastNode[lastA] = find_type_of_D(D)
        return
    best_a = find_best_a_in_A(D, A)
    root[best_a] = {}
    # Iterate over every value of best_a in the *full* data set `data`, not just in D,
    # so that values missing from D still get a leaf labelled with the majority class of D.
    for av in pd.unique(data.loc[:, best_a]):
        Dv = get_D_v_from_D(D, best_a, av)
        if Dv.shape[0] == 0:
            root[best_a][av] = find_type_of_D(D)
        else:
            A_ = A.drop(best_a)
            root[best_a][av] = {}
            TreeGenerate(Dv, A_, root[best_a][av], root[best_a], av)
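The original post does not show the driver code, so here is a minimal sketch of how the pieces might be wired together. My assumptions: `data`, `label` and `categories` are set up as in the earlier snippet, the two continuous columns 密度 and 含糖率 are excluded because continuous values are not handled yet, and 'root' is just a dummy key for the very first call.

# Minimal driver sketch (assumptions noted above).
A = data.columns.drop([label, '密度', '含糖率'])   # candidate (categorical) splitting attributes
tree = {}
TreeGenerate(data, A, tree, tree, 'root')
print(tree)        # nested-dict tree, as shown in the Results section below
createPlot(tree)   # draw it with the plotting code from part 1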
Results
- The final generated dictionary is:
{'纹理': {'清晰': {'根蒂': {'蜷缩': '好瓜',
                            '稍蜷': {'色泽': {'青绿': '好瓜',
                                              '乌黑': {'触感': {'硬滑': '好瓜', '软粘': '坏瓜'}},
                                              '浅白': '好瓜'}},
                            '硬挺': '坏瓜'}},
          '稍糊': {'触感': {'硬滑': '坏瓜', '软粘': '好瓜'}},
          '模糊': '坏瓜'}}
Due to lack of time, continuous-valued attributes (density and sugar content) are not handled yet; I will come back and implement that in the next few days.