使用numpy来实现ID3、C45
重要代码
import math
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] # 显示中文
plt.rcParams['axes.unicode_minus'] = False # 显示负号
# 创建数据集 备注 李航《统计学习方法》中表5.1 贷款申请数据数据
def createDataLH():
    """Build the loan-application dataset (table 5.1 in Li Hang's
    "Statistical Learning Methods").

    :return: (data matrix, class labels, feature names)
    """
    # Build the whole matrix in one literal instead of incremental appends.
    data = np.array([
        ['青年', '否', '否', '一般'],
        ['青年', '否', '否', '好'],
        ['青年', '是', '否', '好'],
        ['青年', '是', '是', '一般'],
        ['青年', '否', '否', '一般'],
        ['中年', '否', '否', '一般'],
        ['中年', '否', '否', '好'],
        ['中年', '是', '是', '好'],
        ['中年', '否', '是', '非常好'],
        ['中年', '否', '是', '非常好'],
        ['老年', '否', '是', '非常好'],
        ['老年', '否', '是', '好'],
        ['老年', '是', '否', '好'],
        ['老年', '是', '否', '非常好'],
        ['老年', '否', '否', '一般'],
    ])
    label = np.array(['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否'])
    name = np.array(['年龄', '有工作', '有房子', '信贷情况'])
    return data, label, name
# 创建西瓜书数据集2.0
def createDataXIGua():
    """Build the watermelon dataset 2.0 from Zhou Zhihua's book.

    :return: (data matrix, class labels, feature names)
    """
    rows = [
        ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑'],
        ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑'],
        ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑'],
        ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑'],
        ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑'],
        ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘'],
        ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘'],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑'],
        ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑'],
        ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘'],
        ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑'],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘'],
        ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑'],
        ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑'],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘'],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑'],
        ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑'],
    ]
    data = np.array(rows)
    label = np.array(['是', '是', '是', '是', '是', '是', '是', '是', '否', '否', '否', '否', '否', '否', '否', '否', '否'])
    name = np.array(['色泽', '根蒂', '敲声', '纹理', '脐部', '触感'])
    return data, label, name
def splitXgData20(xgData, xgLabel):
    """Split the watermelon dataset into fixed train/test subsets.

    :param xgData: full data matrix
    :param xgLabel: full label vector
    :return: (train data, train labels, test data, test labels)
    """
    # Hard-coded row indices of the conventional train/test partition.
    trainIdx = [0, 1, 2, 5, 6, 9, 13, 14, 15, 16]
    testIdx = [3, 4, 7, 8, 10, 11, 12]
    return xgData[trainIdx, :], xgLabel[trainIdx], xgData[testIdx, :], xgLabel[testIdx]
def equalNums(mtri, value):
    """Count the elements of *mtri* that equal *value*.

    :param mtri: array-like of values, or None (treated as empty)
    :param value: the value to count
    :return: int — number of matching elements (0 when mtri is None)
    """
    if mtri is None:
        return 0
    # np.asarray makes the boolean mask work for plain Python lists too;
    # the original also clobbered its own `value` parameter as the result.
    mtri = np.asarray(mtri)
    return mtri[mtri == value].size
def infoEntropy(mtri):
    """Shannon entropy (base 2) of a sequence of discrete values.

    :param mtri: array-like sequence of values
    :return: entropy in bits
    """
    mtri = np.asarray(mtri)
    total = mtri.size
    entropy = 0
    # Accumulate -p * log2(p) over the distinct values of the sequence.
    for v in set(mtri):
        p = mtri[mtri == v].size / total
        entropy -= p * math.log(p, 2)
    return entropy
def conditionnalEntropy(feature, y):
    """Conditional entropy H(y | feature), base 2.

    :param feature: conditioning feature values
    :param y: target values, element-aligned with feature
    :return: conditional entropy in bits
    """
    feature = np.asarray(feature)
    y = np.asarray(y)
    total = feature.size
    entropy = 0
    # Weighted sum of the entropy of y restricted to each feature value:
    # y[mask] is the subset of y where feature takes that value.
    for val in set(feature):
        mask = feature == val
        weight = mask.sum() / total
        entropy += weight * infoEntropy(y[mask])
    return entropy
def infoGain(feature, y):
    """Information gain of y given feature: H(y) - H(y | feature).

    :param feature: candidate splitting feature
    :param y: class labels
    :return: information gain in bits
    """
    baseEntropy = infoEntropy(y)
    condEntropy = conditionnalEntropy(feature, y)
    return baseEntropy - condEntropy
def infoGainRatio(feature, y):
    """Information gain ratio (the C4.5 splitting criterion).

    :param feature: candidate splitting feature
    :param y: class labels
    :return: infoGain(feature, y) / H(feature), or 0 when H(feature) == 0
    """
    # Hoisted: the original computed infoEntropy(feature) twice (once in
    # the guard and again in the division).
    featEntropy = infoEntropy(feature)
    if featEntropy == 0:
        return 0
    return infoGain(feature, y) / featEntropy
def bestFeature(data, labels, method='ID3'):
    """Select the best splitting feature column.

    :param data: 2-D array, samples x features
    :param labels: class labels
    :param method: 'ID3' (information gain) or 'C45' (gain ratio)
    :return: (best column index, its criterion value)
    """
    assert method in ['ID3', 'C45'], "method 须为ID3或C45"
    data = np.asarray(data)
    labels = np.asarray(labels)

    # Criterion chosen by method: ID3 -> gain; C45 -> gain ratio.
    def criterion(column, labels):
        if method == 'ID3':
            return infoGain(column, labels)
        elif method == 'C45':
            return infoGainRatio(column, labels)

    bestEnt = 0
    bestFeat = -1
    # '>=' keeps the LAST column on ties, matching the original behavior.
    for col in range(data.shape[1]):
        ent = criterion(data[:, col], labels)
        if ent >= bestEnt:
            bestEnt = ent
            bestFeat = col
    return bestFeat, bestEnt
def splitFeatureData(data, labels, feature):
    """Partition data/labels by the values of one feature column.

    The feature column itself is removed from the returned data subsets.

    :param data: 2-D array, samples x features
    :param labels: class labels
    :param feature: column index to split on
    :return: (dict value -> data subset, dict value -> label subset)
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    featCol = data[:, feature]
    # Remaining columns after dropping the splitting feature.
    remaining = np.delete(data, feature, axis=1)
    dataSet = {}
    labelSet = {}
    for val in set(featCol):
        mask = featCol == val
        dataSet[val] = remaining[mask]
        labelSet[val] = labels[mask]
    return dataSet, labelSet
def voteLabel(labels):
    """Return the most frequent label (majority vote).

    :param labels: sequence of class labels
    :return: the label with the highest count
    """
    uniqLabels = list(set(labels))
    labels = np.asarray(labels)
    # Count occurrences of each distinct label directly on the array.
    counts = [labels[labels == lab].size for lab in uniqLabels]
    return uniqLabels[counts.index(max(counts))]
# 创建基础决策树
def createTree(data, labels, names, method='ID3'):
    """Recursively build an (unpruned) decision tree.

    :param data: 2-D array, samples x features
    :param labels: class labels
    :param names: feature names, aligned with data columns
    :param method: split criterion, 'ID3' or 'C45'
    :return: nested dict tree {featureName: {featureValue: subtree-or-label}},
             or a single label string for a leaf
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    names = np.asarray(names)
    # All samples share one label: return it as a leaf.
    if len(set(labels)) == 1:
        return labels[0]
    # No features left to split on: majority vote.
    elif data.size == 0:
        return voteLabel(labels)
    # Otherwise pick the best feature by the chosen criterion.
    bestFeat, bestEnt = bestFeature(data, labels, method=method)
    bestFeatName = names[bestFeat]
    # Drop the used feature name for the recursive calls.
    names = np.delete(names, [bestFeat])
    # New decision node keyed by the chosen feature name.
    decisionTree = {bestFeatName: {}}
    # Split on the chosen feature and recurse per feature value.
    dataSet, labelSet = splitFeatureData(data, labels, bestFeat)
    for featValue in dataSet.keys():
        decisionTree[bestFeatName][featValue] = createTree(dataSet.get(featValue), labelSet.get(featValue), names,
                                                           method)
    return decisionTree
画树:
# 画树
def getTreeSize(decisionTree):
    """Measure a decision tree: leaf count and depth.

    :param decisionTree: nested dict tree as built by createTree
    :return: (leafNum, treeDepth, leafDepth); leafDepth is the depth of the
             last-visited child, kept for interface compatibility
    """
    nodeName = list(decisionTree.keys())[0]
    nodeValue = decisionTree[nodeName]
    leafNum = 0
    treeDepth = 0
    leafDepth = 0
    for val in nodeValue.keys():
        if type(nodeValue[val]) == dict:
            # Recurse ONCE per subtree; the original recursed twice
            # (once for [0], once for [1]) — exponential recomputation.
            subLeafNum, subTreeDepth, _ = getTreeSize(nodeValue[val])
            leafNum += subLeafNum
            leafDepth = 1 + subTreeDepth
        else:
            leafNum += 1
            leafDepth = 1
        treeDepth = max(treeDepth, leafDepth)
    return leafNum, treeDepth, leafDepth
def dtClassify(decisionTree, rowData, names):
    """Classify one sample with a built decision tree.

    :param decisionTree: nested dict tree as built by createTree
    :param rowData: one row of feature values
    :param names: feature names, aligned with rowData
    :return: predicted label, or None when the tree has no branch for the
             sample's feature value
    """
    names = list(names)
    # Root feature of this (sub)tree and its value-to-child mapping.
    feature = list(decisionTree.keys())[0]
    featDict = decisionTree[feature]
    # Value this sample takes for that feature.
    featVal = rowData[names.index(feature)]
    # The original stored the result in ``global classLabel``, which
    # silently returned a stale value (or raised NameError) when the
    # feature value had no branch; a local with a None default is safe.
    classLabel = None
    if featVal in featDict.keys():
        if type(featDict[featVal]) == dict:
            # A dict child is a subtree: recurse.
            classLabel = dtClassify(featDict[featVal], rowData, names)
        else:
            classLabel = featDict[featVal]
    return classLabel
def plotNode(nodeText, centerPt, parentPt, nodeStyle):
    """Draw one tree node with an arrow pointing back to its parent.

    :param nodeText: text shown in the node box
    :param centerPt: node position (axes-fraction coordinates)
    :param parentPt: parent position the arrow connects to
    :param nodeStyle: bbox style dict for the node box
    :return: None
    """
    arrow = dict(arrowstyle="<-")  # edge style between parent and child
    createPlot.ax1.annotate(
        nodeText,
        xy=parentPt,
        xycoords="axes fraction",
        xytext=centerPt,
        textcoords="axes fraction",
        va="center",
        ha="center",
        bbox=nodeStyle,
        arrowprops=arrow,
    )
def plotMidText(centerPt, parentPt, lineText):
    """Write the branch label halfway along the parent-child edge.

    :param centerPt: child node position
    :param parentPt: parent node position
    :param lineText: text to place at the midpoint
    :return: None
    """
    midX = (centerPt[0] + parentPt[0]) / 2.0
    midY = (centerPt[1] + parentPt[1]) / 2.0
    createPlot.ax1.text(midX, midY, lineText)
def plotTree(decisionTree, parentPt, parentValue):
    """Recursively draw a decision tree (helper of createPlot).

    Uses state stored as attributes on the function itself by createPlot:
    figSize, totalLeaf, totalDepth, and the cursor position x / y.

    :param decisionTree: nested dict tree
    :param parentPt: coordinates of the parent node
    :param parentValue: branch label leading to this node
    :return: None
    """
    decisionNodeStyle = dict(boxstyle="sawtooth", fc="0.8")  # decision-node box
    leafNodeStyle = {"boxstyle": "round4", "fc": "0.8"}  # leaf-node box
    # Size of this subtree (width in leaves, height in levels).
    leafNum, treeDepth, leafDepth = getTreeSize(decisionTree)
    # Figure is drawn in a figSize x figSize area.
    # Horizontal offset between neighbouring leaves.
    plotTree.xOff = plotTree.figSize / (plotTree.totalLeaf - 1)
    # Vertical offset between tree levels.
    plotTree.yOff = plotTree.figSize / plotTree.totalDepth
    nodeName = list(decisionTree.keys())[0]
    # The decision node is centred horizontally above its subtree's leaves;
    # for the root, start and end points coincide so no edge is drawn.
    centerPt = (plotTree.x + (leafNum - 1) * plotTree.xOff / 2.0, plotTree.y)
    # Draw this decision node.
    plotNode(nodeName, centerPt, parentPt, decisionNodeStyle)
    # Branch label on the edge to the parent.
    plotMidText(centerPt, parentPt, parentValue)
    treeValue = decisionTree[nodeName]
    # Move one level down before drawing the children.
    plotTree.y = plotTree.y - plotTree.yOff
    for val in treeValue.keys():
        # A dict child is a subtree (recurse); anything else is a leaf.
        if type(treeValue[val]) == dict:
            plotTree(treeValue[val], centerPt, str(val))
        else:
            plotNode(treeValue[val], (plotTree.x, plotTree.y), centerPt, leafNodeStyle)
            plotMidText((plotTree.x, plotTree.y), centerPt, str(val))
            # Advance the cursor to the next leaf slot.
            plotTree.x = plotTree.x + plotTree.xOff
    # Restore the level height when unwinding the recursion.
    plotTree.y = plotTree.y + plotTree.yOff
def createPlot(decisionTree):
    """Render a decision tree with matplotlib.

    :param decisionTree: nested dict tree as built by createTree
    :return: None (shows the figure)
    """
    fig = plt.figure(1, facecolor="white")
    fig.clf()
    axprops = {"xticks": [], "yticks": []}
    # Shared axes used by plotNode / plotMidText.
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    # Drawing-area size used by plotTree for its offsets.
    plotTree.figSize = 1.5
    # Total leaf count and depth of the whole tree.
    plotTree.totalLeaf, plotTree.totalDepth, plotTree.leafDepth = getTreeSize(decisionTree)
    # Initial leaf x position and the root level height y.
    plotTree.x = 0
    plotTree.y = plotTree.figSize
    plotTree(decisionTree, (plotTree.figSize / 2.0, plotTree.y), "")
    plt.show()
防止过拟合(基于ID3和C45来做的)
剪枝:
剪枝是决策树学习算法中对付过拟合的主要手段。通过剪枝可以主动去掉一些分支来降低过拟合的风险
剪枝策略
预剪枝:
预剪枝是指在决策树生成过程中,对每个结点在划 分前先进行估计,若当前结点的划分不能带来决策树泛化性能提升,则停止划 分并将当前结点标记为叶结点。
预剪枝基于“贪心”本质禁止这些分支展开,这给预剪枝决策树带来了欠拟合的风险。
预剪枝实现:
# 创建预剪枝决策树
def createTreePrePruning(dataTrain, labelTrain, dataTest, labelTest, names, method='ID3'):
    """Build a decision tree with pre-pruning.

    Before committing to each split, test-set accuracy with the split is
    compared against accuracy without it; the split is kept only when
    accuracy does not drop.

    :param dataTrain: training data
    :param labelTrain: training labels
    :param dataTest: test data (may be None)
    :param labelTest: test labels (may be None)
    :param names: feature names
    :param method: 'ID3' or 'C45'
    :return: pre-pruned decision tree (nested dict) or a leaf label
    """
    dataTestSet = {}
    labelTestSet = {}
    labelTestRatioPre = 0.0
    labelTestRatioPost = 0.0
    trainData = np.asarray(dataTrain)
    labelTrain = np.asarray(labelTrain)
    testData = np.asarray(dataTest)  # NOTE(review): unused local, kept as-is
    labelTest = np.asarray(labelTest)
    names = np.asarray(names)
    # Single label left: leaf.
    if len(set(labelTrain)) == 1:
        return labelTrain[0]
    # No features left: majority vote.
    elif trainData.size == 0:
        return voteLabel(labelTrain)
    # Otherwise pick the best feature.
    bestFeat, bestEnt = bestFeature(dataTrain, labelTrain, method=method)
    bestFeatName = names[bestFeat]
    # Drop the used feature name for the recursive calls.
    names = np.delete(names, [bestFeat])
    # Split the training data on the chosen feature.
    dataTrainSet, labelTrainSet = splitFeatureData(dataTrain, labelTrain, bestFeat)
    # Pre-pruning evaluation: majority label before splitting and its
    # accuracy on the training labels.
    labelTrainLabelPre = voteLabel(labelTrain)
    labelTrainRatioPre = equalNums(labelTrain, labelTrainLabelPre) / labelTrain.size
    # Accuracy after the split, measured on the test data.
    if dataTest is not None:
        dataTestSet, labelTestSet = splitFeatureData(dataTest, labelTest, bestFeat)
        # Test accuracy before splitting.
        labelTestRatioPre = equalNums(labelTest, labelTrainLabelPre) / labelTest.size
        # Number of correct test predictions per feature value after the split.
        labelTrainEqNumPost = 0
        for val in labelTrainSet.keys():
            labelTrainEqNumPost += equalNums(labelTestSet.get(val), voteLabel(labelTrainSet.get(val))) + 0.0
        # Test accuracy after splitting.
        labelTestRatioPost = labelTrainEqNumPost / labelTest.size
    # No test data but pre-split train accuracy equals the 0.5 minimum:
    # keep splitting.
    if dataTest is None and labelTrainRatioPre == 0.5:
        decisionTree = {bestFeatName: {}}
        for featValue in dataTrainSet.keys():
            decisionTree[bestFeatName][featValue] = createTreePrePruning(dataTrainSet.get(featValue),
                                                                         labelTrainSet.get(featValue)
                                                                         , None, None, names, method)
    elif dataTest is None:
        return labelTrainLabelPre
    # Accuracy dropped after the split: return a leaf instead.
    elif labelTestRatioPost < labelTestRatioPre:
        return labelTrainLabelPre
    else:
        # Keep the split and recurse for each feature value's subset.
        decisionTree = {bestFeatName: {}}
        for featValue in dataTrainSet.keys():
            decisionTree[bestFeatName][featValue] = createTreePrePruning(dataTrainSet.get(featValue),
                                                                         labelTrainSet.get(featValue)
                                                                         , dataTestSet.get(featValue),
                                                                         labelTestSet.get(featValue)
                                                                         , names, method)
    return decisionTree
后剪枝:
后剪枝则是先从训练集生成一棵完整的决策树, 然后自底向上地对非叶结点进行考察,若将该结点对应的子树替换为叶结点能 带来决策树泛化性能提升,则将该子树替换为叶结点。
后剪枝决策树通常比预剪枝决策树保留了更多的分支。一般情形下,后剪枝决策树的欠拟合风险很小,泛化性能往往优于预剪枝决策树。但后剪枝过程是在生成完全决策树之后进行的,并且要自底向上地对树中的所有非叶结点进行逐一考察,因此其训练时间开销比未剪枝决策树和预剪枝决策树都要大得多。
后剪枝评估时需要划分前的标签,这里思考两种方法:
一是,不改变原来的训练函数,评估时使用训练数据对划分前的节点标签重新打标
二是,改进训练函数,在训练的同时为每个节点增加划分前的标签,这样可以保证评估时只使用测试数据,避免再次使用大量的训练数据
这里采用第二种方法 写新的函数 createTreeWithLabel,当然也可以修改createTree来添加参数实现
后剪枝实现:
# 后剪枝
def createTreeWithLabel(data, labels, names, method='ID3'):
    """Build a tree that also stores each node's pre-division label.

    Like createTree, but every decision node additionally records the
    majority label under key "_vpdl" (votedPreDivisionLabel) so that
    post-pruning can evaluate nodes using test data alone.

    :param data: 2-D array, samples x features
    :param labels: class labels
    :param names: feature names
    :param method: 'ID3' or 'C45'
    :return: labeled decision tree (nested dict) or a leaf label
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    names = np.asarray(names)
    # Majority label this node would carry if it were not split.
    votedLabel = voteLabel(labels)
    # Single label left: leaf.
    if len(set(labels)) == 1:
        return votedLabel
    # No features left: leaf by majority vote.
    elif data.size == 0:
        return votedLabel
    # Otherwise pick the best feature.
    bestFeat, bestEnt = bestFeature(data, labels, method=method)
    bestFeatName = names[bestFeat]
    # Drop the used feature name for the recursive calls.
    names = np.delete(names, [bestFeat])
    # "_vpdl" = votedPreDivisionLabel, the label before this split.
    decisionTree = {bestFeatName: {"_vpdl": votedLabel}}
    # Split on the chosen feature and recurse per feature value.
    dataSet, labelSet = splitFeatureData(data, labels, bestFeat)
    for featValue in dataSet.keys():
        decisionTree[bestFeatName][featValue] = createTreeWithLabel(dataSet.get(featValue), labelSet.get(featValue),
                                                                    names, method)
    return decisionTree
# 将带预划分标签的tree转化为常规的tree
def convertTree(labeledTree):
    """Strip the pre-division "_vpdl" labels from a labeled tree.

    :param labeledTree: tree built by createTreeWithLabel
    :return: a new equivalent tree without "_vpdl" entries; the input tree
             is left unmodified
    """
    labeledTreeNew = labeledTree.copy()
    nodeName = list(labeledTree.keys())[0]
    labeledTreeNew[nodeName] = labeledTree[nodeName].copy()
    for val in list(labeledTree[nodeName].keys()):
        if val == "_vpdl":
            # Drop the bookkeeping label (the original also bound it to an
            # unused local variable `a`, removed here).
            labeledTreeNew[nodeName].pop(val)
        elif type(labeledTree[nodeName][val]) == dict:
            labeledTreeNew[nodeName][val] = convertTree(labeledTree[nodeName][val])
    return labeledTreeNew
# 后剪枝 训练完成后决策节点进行替换评估 这里可以直接对xgTreeTrain进行操作
def treePostPruning(labeledTree, dataTest, labelTest, names):
    """Post-prune a labeled tree using test data.

    Walks the tree bottom-up; when a node whose children are all leaves
    is no more accurate on the test data than its pre-division "_vpdl"
    label, the node is collapsed into that leaf.

    :param labeledTree: tree built by createTreeWithLabel
    :param dataTest: test data for this subtree
    :param labelTest: test labels for this subtree
    :param names: feature names still present in dataTest columns
    :return: pruned subtree (nested dict) or a leaf label
    """
    newTree = labeledTree.copy()
    dataTest = np.asarray(dataTest)
    labelTest = np.asarray(labelTest)
    names = np.asarray(names)
    # Feature name of this decision node.
    featName = list(labeledTree.keys())[0]
    # Column of that feature; argwhere returns a 2-D result (e.g. [[4]]),
    # so take row 0, column 0.
    featCol = np.argwhere(names == featName)[0][0]
    names = np.delete(names, [featCol])
    # Copy the node's value dict and pop the pre-division label.
    newTree[featName] = labeledTree[featName].copy()
    featValueDict = newTree[featName]
    featPreLabel = featValueDict.pop("_vpdl")
    # subTreeFlag stays 1 while any child remains a subtree.
    subTreeFlag = 0
    dataTestSet = {}
    labelTestSet = {}
    # Split test data only when there is any; a shape sum of 0 means an
    # empty array (the author notes `is None` does not work here).
    dataFlag = 1 if sum(dataTest.shape) > 0 else 0
    if dataFlag == 1:
        dataTestSet, labelTestSet = splitFeatureData(dataTest, labelTest, featCol)
    for featValue in featValueDict.keys():
        print(featValue)
        if dataFlag == 1 and type(featValueDict[featValue]) == dict:
            subTreeFlag = 1
            # Recurse into the subtree with its share of the test data.
            newTree[featName][featValue] = treePostPruning(featValueDict[featValue], dataTestSet.get(featValue),
                                                           labelTestSet.get(featValue), names)
            print(dataTestSet.get(featValue))
            # If the recursion collapsed the child to a leaf, evaluate below.
            if type(featValueDict[featValue]) != dict:
                subTreeFlag = 0
        # No test data for this node: just strip the "_vpdl" labels.
        if dataFlag == 0 and type(featValueDict[featValue]) == dict:
            subTreeFlag = 1
            newTree[featName][featValue] = convertTree(featValueDict[featValue])
    # All children are leaves: compare test accuracy before and after the
    # split and collapse the node when the split does not help. The test
    # is strictly '<'; use '<=' for more aggressive pruning if desired.
    if subTreeFlag == 0:
        ratioPreDivision = equalNums(labelTest, featPreLabel) / labelTest.size
        equalNum = 0
        for val in labelTestSet.keys():
            # NOTE(review): featValueDict[val] assumes every feature value in
            # the test split has a branch in the tree — KeyError otherwise.
            equalNum += equalNums(labelTestSet[val], featValueDict[val])
        ratioAfterDivision = equalNum / labelTest.size
        if ratioAfterDivision < ratioPreDivision:
            newTree = featPreLabel
    return newTree
全部代码:
import math
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] # 显示中文
plt.rcParams['axes.unicode_minus'] = False # 显示负号
# 创建数据集 备注 李航《统计学习方法》中表5.1 贷款申请数据数据
def createDataLH():
    """Build the loan-application dataset (table 5.1 in Li Hang's
    "Statistical Learning Methods").

    :return: (data matrix, class labels, feature names)
    """
    # Build the whole matrix in one literal instead of incremental appends.
    data = np.array([
        ['青年', '否', '否', '一般'],
        ['青年', '否', '否', '好'],
        ['青年', '是', '否', '好'],
        ['青年', '是', '是', '一般'],
        ['青年', '否', '否', '一般'],
        ['中年', '否', '否', '一般'],
        ['中年', '否', '否', '好'],
        ['中年', '是', '是', '好'],
        ['中年', '否', '是', '非常好'],
        ['中年', '否', '是', '非常好'],
        ['老年', '否', '是', '非常好'],
        ['老年', '否', '是', '好'],
        ['老年', '是', '否', '好'],
        ['老年', '是', '否', '非常好'],
        ['老年', '否', '否', '一般'],
    ])
    label = np.array(['否', '否', '是', '是', '否', '否', '否', '是', '是', '是', '是', '是', '是', '是', '否'])
    name = np.array(['年龄', '有工作', '有房子', '信贷情况'])
    return data, label, name
# 创建西瓜书数据集2.0
def createDataXIGua():
    """Build the watermelon dataset 2.0 from Zhou Zhihua's book.

    :return: (data matrix, class labels, feature names)
    """
    rows = [
        ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑'],
        ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑'],
        ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑'],
        ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑'],
        ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑'],
        ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘'],
        ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘'],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑'],
        ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑'],
        ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘'],
        ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑'],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘'],
        ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑'],
        ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑'],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘'],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑'],
        ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑'],
    ]
    data = np.array(rows)
    label = np.array(['是', '是', '是', '是', '是', '是', '是', '是', '否', '否', '否', '否', '否', '否', '否', '否', '否'])
    name = np.array(['色泽', '根蒂', '敲声', '纹理', '脐部', '触感'])
    return data, label, name
def splitXgData20(xgData, xgLabel):
    """Split the watermelon dataset into fixed train/test subsets.

    :param xgData: full data matrix
    :param xgLabel: full label vector
    :return: (train data, train labels, test data, test labels)
    """
    # Hard-coded row indices of the conventional train/test partition.
    trainIdx = [0, 1, 2, 5, 6, 9, 13, 14, 15, 16]
    testIdx = [3, 4, 7, 8, 10, 11, 12]
    return xgData[trainIdx, :], xgLabel[trainIdx], xgData[testIdx, :], xgLabel[testIdx]
def equalNums(mtri, value):
    """Count the elements of *mtri* that equal *value*.

    :param mtri: array-like of values, or None (treated as empty)
    :param value: the value to count
    :return: int — number of matching elements (0 when mtri is None)
    """
    if mtri is None:
        return 0
    # np.asarray makes the boolean mask work for plain Python lists too;
    # the original also clobbered its own `value` parameter as the result.
    mtri = np.asarray(mtri)
    return mtri[mtri == value].size
def infoEntropy(mtri):
    """Shannon entropy (base 2) of a sequence of discrete values.

    :param mtri: array-like sequence of values
    :return: entropy in bits
    """
    mtri = np.asarray(mtri)
    total = mtri.size
    entropy = 0
    # Accumulate -p * log2(p) over the distinct values of the sequence.
    for v in set(mtri):
        p = mtri[mtri == v].size / total
        entropy -= p * math.log(p, 2)
    return entropy
def conditionnalEntropy(feature, y):
    """Conditional entropy H(y | feature), base 2.

    :param feature: conditioning feature values
    :param y: target values, element-aligned with feature
    :return: conditional entropy in bits
    """
    feature = np.asarray(feature)
    y = np.asarray(y)
    total = feature.size
    entropy = 0
    # Weighted sum of the entropy of y restricted to each feature value:
    # y[mask] is the subset of y where feature takes that value.
    for val in set(feature):
        mask = feature == val
        weight = mask.sum() / total
        entropy += weight * infoEntropy(y[mask])
    return entropy
def infoGain(feature, y):
    """Information gain of y given feature: H(y) - H(y | feature).

    :param feature: candidate splitting feature
    :param y: class labels
    :return: information gain in bits
    """
    baseEntropy = infoEntropy(y)
    condEntropy = conditionnalEntropy(feature, y)
    return baseEntropy - condEntropy
def infoGainRatio(feature, y):
    """Information gain ratio (the C4.5 splitting criterion).

    :param feature: candidate splitting feature
    :param y: class labels
    :return: infoGain(feature, y) / H(feature), or 0 when H(feature) == 0
    """
    # Hoisted: the original computed infoEntropy(feature) twice (once in
    # the guard and again in the division).
    featEntropy = infoEntropy(feature)
    if featEntropy == 0:
        return 0
    return infoGain(feature, y) / featEntropy
def bestFeature(data, labels, method='ID3'):
    """Select the best splitting feature column.

    :param data: 2-D array, samples x features
    :param labels: class labels
    :param method: 'ID3' (information gain) or 'C45' (gain ratio)
    :return: (best column index, its criterion value)
    """
    assert method in ['ID3', 'C45'], "method 须为ID3或C45"
    data = np.asarray(data)
    labels = np.asarray(labels)

    # Criterion chosen by method: ID3 -> gain; C45 -> gain ratio.
    def criterion(column, labels):
        if method == 'ID3':
            return infoGain(column, labels)
        elif method == 'C45':
            return infoGainRatio(column, labels)

    bestEnt = 0
    bestFeat = -1
    # '>=' keeps the LAST column on ties, matching the original behavior.
    for col in range(data.shape[1]):
        ent = criterion(data[:, col], labels)
        if ent >= bestEnt:
            bestEnt = ent
            bestFeat = col
    return bestFeat, bestEnt
def splitFeatureData(data, labels, feature):
    """Partition data/labels by the values of one feature column.

    The feature column itself is removed from the returned data subsets.

    :param data: 2-D array, samples x features
    :param labels: class labels
    :param feature: column index to split on
    :return: (dict value -> data subset, dict value -> label subset)
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    featCol = data[:, feature]
    # Remaining columns after dropping the splitting feature.
    remaining = np.delete(data, feature, axis=1)
    dataSet = {}
    labelSet = {}
    for val in set(featCol):
        mask = featCol == val
        dataSet[val] = remaining[mask]
        labelSet[val] = labels[mask]
    return dataSet, labelSet
def voteLabel(labels):
    """Return the most frequent label (majority vote).

    :param labels: sequence of class labels
    :return: the label with the highest count
    """
    uniqLabels = list(set(labels))
    labels = np.asarray(labels)
    # Count occurrences of each distinct label directly on the array.
    counts = [labels[labels == lab].size for lab in uniqLabels]
    return uniqLabels[counts.index(max(counts))]
# 创建基础决策树
def createTree(data, labels, names, method='ID3'):
    """Recursively build an (unpruned) decision tree.

    :param data: 2-D array, samples x features
    :param labels: class labels
    :param names: feature names, aligned with data columns
    :param method: split criterion, 'ID3' or 'C45'
    :return: nested dict tree {featureName: {featureValue: subtree-or-label}},
             or a single label string for a leaf
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    names = np.asarray(names)
    # All samples share one label: return it as a leaf.
    if len(set(labels)) == 1:
        return labels[0]
    # No features left to split on: majority vote.
    elif data.size == 0:
        return voteLabel(labels)
    # Otherwise pick the best feature by the chosen criterion.
    bestFeat, bestEnt = bestFeature(data, labels, method=method)
    bestFeatName = names[bestFeat]
    # Drop the used feature name for the recursive calls.
    names = np.delete(names, [bestFeat])
    # New decision node keyed by the chosen feature name.
    decisionTree = {bestFeatName: {}}
    # Split on the chosen feature and recurse per feature value.
    dataSet, labelSet = splitFeatureData(data, labels, bestFeat)
    for featValue in dataSet.keys():
        decisionTree[bestFeatName][featValue] = createTree(dataSet.get(featValue), labelSet.get(featValue), names,
                                                           method)
    return decisionTree
# 画树
def getTreeSize(decisionTree):
    """Measure a decision tree: leaf count and depth.

    :param decisionTree: nested dict tree as built by createTree
    :return: (leafNum, treeDepth, leafDepth); leafDepth is the depth of the
             last-visited child, kept for interface compatibility
    """
    nodeName = list(decisionTree.keys())[0]
    nodeValue = decisionTree[nodeName]
    leafNum = 0
    treeDepth = 0
    leafDepth = 0
    for val in nodeValue.keys():
        if type(nodeValue[val]) == dict:
            # Recurse ONCE per subtree; the original recursed twice
            # (once for [0], once for [1]) — exponential recomputation.
            subLeafNum, subTreeDepth, _ = getTreeSize(nodeValue[val])
            leafNum += subLeafNum
            leafDepth = 1 + subTreeDepth
        else:
            leafNum += 1
            leafDepth = 1
        treeDepth = max(treeDepth, leafDepth)
    return leafNum, treeDepth, leafDepth
def dtClassify(decisionTree, rowData, names):
    """Classify one sample with a built decision tree.

    :param decisionTree: nested dict tree as built by createTree
    :param rowData: one row of feature values
    :param names: feature names, aligned with rowData
    :return: predicted label, or None when the tree has no branch for the
             sample's feature value
    """
    names = list(names)
    # Root feature of this (sub)tree and its value-to-child mapping.
    feature = list(decisionTree.keys())[0]
    featDict = decisionTree[feature]
    # Value this sample takes for that feature.
    featVal = rowData[names.index(feature)]
    # The original stored the result in ``global classLabel``, which
    # silently returned a stale value (or raised NameError) when the
    # feature value had no branch; a local with a None default is safe.
    classLabel = None
    if featVal in featDict.keys():
        if type(featDict[featVal]) == dict:
            # A dict child is a subtree: recurse.
            classLabel = dtClassify(featDict[featVal], rowData, names)
        else:
            classLabel = featDict[featVal]
    return classLabel
def plotNode(nodeText, centerPt, parentPt, nodeStyle):
    """Draw one tree node with an arrow pointing back to its parent.

    :param nodeText: text shown in the node box
    :param centerPt: node position (axes-fraction coordinates)
    :param parentPt: parent position the arrow connects to
    :param nodeStyle: bbox style dict for the node box
    :return: None
    """
    arrow = dict(arrowstyle="<-")  # edge style between parent and child
    createPlot.ax1.annotate(
        nodeText,
        xy=parentPt,
        xycoords="axes fraction",
        xytext=centerPt,
        textcoords="axes fraction",
        va="center",
        ha="center",
        bbox=nodeStyle,
        arrowprops=arrow,
    )
def plotMidText(centerPt, parentPt, lineText):
    """Write the branch label halfway along the parent-child edge.

    :param centerPt: child node position
    :param parentPt: parent node position
    :param lineText: text to place at the midpoint
    :return: None
    """
    midX = (centerPt[0] + parentPt[0]) / 2.0
    midY = (centerPt[1] + parentPt[1]) / 2.0
    createPlot.ax1.text(midX, midY, lineText)
def plotTree(decisionTree, parentPt, parentValue):
    """Recursively draw a decision tree (helper of createPlot).

    Uses state stored as attributes on the function itself by createPlot:
    figSize, totalLeaf, totalDepth, and the cursor position x / y.

    :param decisionTree: nested dict tree
    :param parentPt: coordinates of the parent node
    :param parentValue: branch label leading to this node
    :return: None
    """
    decisionNodeStyle = dict(boxstyle="sawtooth", fc="0.8")  # decision-node box
    leafNodeStyle = {"boxstyle": "round4", "fc": "0.8"}  # leaf-node box
    # Size of this subtree (width in leaves, height in levels).
    leafNum, treeDepth, leafDepth = getTreeSize(decisionTree)
    # Figure is drawn in a figSize x figSize area.
    # Horizontal offset between neighbouring leaves.
    plotTree.xOff = plotTree.figSize / (plotTree.totalLeaf - 1)
    # Vertical offset between tree levels.
    plotTree.yOff = plotTree.figSize / plotTree.totalDepth
    nodeName = list(decisionTree.keys())[0]
    # The decision node is centred horizontally above its subtree's leaves;
    # for the root, start and end points coincide so no edge is drawn.
    centerPt = (plotTree.x + (leafNum - 1) * plotTree.xOff / 2.0, plotTree.y)
    # Draw this decision node.
    plotNode(nodeName, centerPt, parentPt, decisionNodeStyle)
    # Branch label on the edge to the parent.
    plotMidText(centerPt, parentPt, parentValue)
    treeValue = decisionTree[nodeName]
    # Move one level down before drawing the children.
    plotTree.y = plotTree.y - plotTree.yOff
    for val in treeValue.keys():
        # A dict child is a subtree (recurse); anything else is a leaf.
        if type(treeValue[val]) == dict:
            plotTree(treeValue[val], centerPt, str(val))
        else:
            plotNode(treeValue[val], (plotTree.x, plotTree.y), centerPt, leafNodeStyle)
            plotMidText((plotTree.x, plotTree.y), centerPt, str(val))
            # Advance the cursor to the next leaf slot.
            plotTree.x = plotTree.x + plotTree.xOff
    # Restore the level height when unwinding the recursion.
    plotTree.y = plotTree.y + plotTree.yOff
def createPlot(decisionTree):
    """Render a decision tree with matplotlib.

    :param decisionTree: nested dict tree as built by createTree
    :return: None (shows the figure)
    """
    fig = plt.figure(1, facecolor="white")
    fig.clf()
    axprops = {"xticks": [], "yticks": []}
    # Shared axes used by plotNode / plotMidText.
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    # Drawing-area size used by plotTree for its offsets.
    plotTree.figSize = 1.5
    # Total leaf count and depth of the whole tree.
    plotTree.totalLeaf, plotTree.totalDepth, plotTree.leafDepth = getTreeSize(decisionTree)
    # Initial leaf x position and the root level height y.
    plotTree.x = 0
    plotTree.y = plotTree.figSize
    plotTree(decisionTree, (plotTree.figSize / 2.0, plotTree.y), "")
    plt.show()
# 创建预剪枝决策树
def createTreePrePruning(dataTrain, labelTrain, dataTest, labelTest, names, method='ID3'):
    """Build a decision tree with pre-pruning.

    Before committing to each split, test-set accuracy with the split is
    compared against accuracy without it; the split is kept only when
    accuracy does not drop.

    :param dataTrain: training data
    :param labelTrain: training labels
    :param dataTest: test data (may be None)
    :param labelTest: test labels (may be None)
    :param names: feature names
    :param method: 'ID3' or 'C45'
    :return: pre-pruned decision tree (nested dict) or a leaf label
    """
    dataTestSet = {}
    labelTestSet = {}
    labelTestRatioPre = 0.0
    labelTestRatioPost = 0.0
    trainData = np.asarray(dataTrain)
    labelTrain = np.asarray(labelTrain)
    testData = np.asarray(dataTest)  # NOTE(review): unused local, kept as-is
    labelTest = np.asarray(labelTest)
    names = np.asarray(names)
    # Single label left: leaf.
    if len(set(labelTrain)) == 1:
        return labelTrain[0]
    # No features left: majority vote.
    elif trainData.size == 0:
        return voteLabel(labelTrain)
    # Otherwise pick the best feature.
    bestFeat, bestEnt = bestFeature(dataTrain, labelTrain, method=method)
    bestFeatName = names[bestFeat]
    # Drop the used feature name for the recursive calls.
    names = np.delete(names, [bestFeat])
    # Split the training data on the chosen feature.
    dataTrainSet, labelTrainSet = splitFeatureData(dataTrain, labelTrain, bestFeat)
    # Pre-pruning evaluation: majority label before splitting and its
    # accuracy on the training labels.
    labelTrainLabelPre = voteLabel(labelTrain)
    labelTrainRatioPre = equalNums(labelTrain, labelTrainLabelPre) / labelTrain.size
    # Accuracy after the split, measured on the test data.
    if dataTest is not None:
        dataTestSet, labelTestSet = splitFeatureData(dataTest, labelTest, bestFeat)
        # Test accuracy before splitting.
        labelTestRatioPre = equalNums(labelTest, labelTrainLabelPre) / labelTest.size
        # Number of correct test predictions per feature value after the split.
        labelTrainEqNumPost = 0
        for val in labelTrainSet.keys():
            labelTrainEqNumPost += equalNums(labelTestSet.get(val), voteLabel(labelTrainSet.get(val))) + 0.0
        # Test accuracy after splitting.
        labelTestRatioPost = labelTrainEqNumPost / labelTest.size
    # No test data but pre-split train accuracy equals the 0.5 minimum:
    # keep splitting.
    if dataTest is None and labelTrainRatioPre == 0.5:
        decisionTree = {bestFeatName: {}}
        for featValue in dataTrainSet.keys():
            decisionTree[bestFeatName][featValue] = createTreePrePruning(dataTrainSet.get(featValue),
                                                                         labelTrainSet.get(featValue)
                                                                         , None, None, names, method)
    elif dataTest is None:
        return labelTrainLabelPre
    # Accuracy dropped after the split: return a leaf instead.
    elif labelTestRatioPost < labelTestRatioPre:
        return labelTrainLabelPre
    else:
        # Keep the split and recurse for each feature value's subset.
        decisionTree = {bestFeatName: {}}
        for featValue in dataTrainSet.keys():
            decisionTree[bestFeatName][featValue] = createTreePrePruning(dataTrainSet.get(featValue),
                                                                         labelTrainSet.get(featValue)
                                                                         , dataTestSet.get(featValue),
                                                                         labelTestSet.get(featValue)
                                                                         , names, method)
    return decisionTree
# 后剪枝
def createTreeWithLabel(data, labels, names, method='ID3'):
    """Build a tree that also stores each node's pre-division label.

    Like createTree, but every decision node additionally records the
    majority label under key "_vpdl" (votedPreDivisionLabel) so that
    post-pruning can evaluate nodes using test data alone.

    :param data: 2-D array, samples x features
    :param labels: class labels
    :param names: feature names
    :param method: 'ID3' or 'C45'
    :return: labeled decision tree (nested dict) or a leaf label
    """
    data = np.asarray(data)
    labels = np.asarray(labels)
    names = np.asarray(names)
    # Majority label this node would carry if it were not split.
    votedLabel = voteLabel(labels)
    # Single label left: leaf.
    if len(set(labels)) == 1:
        return votedLabel
    # No features left: leaf by majority vote.
    elif data.size == 0:
        return votedLabel
    # Otherwise pick the best feature.
    bestFeat, bestEnt = bestFeature(data, labels, method=method)
    bestFeatName = names[bestFeat]
    # Drop the used feature name for the recursive calls.
    names = np.delete(names, [bestFeat])
    # "_vpdl" = votedPreDivisionLabel, the label before this split.
    decisionTree = {bestFeatName: {"_vpdl": votedLabel}}
    # Split on the chosen feature and recurse per feature value.
    dataSet, labelSet = splitFeatureData(data, labels, bestFeat)
    for featValue in dataSet.keys():
        decisionTree[bestFeatName][featValue] = createTreeWithLabel(dataSet.get(featValue), labelSet.get(featValue),
                                                                    names, method)
    return decisionTree
# 将带预划分标签的tree转化为常规的tree
def convertTree(labeledTree):
    """Strip the pre-division "_vpdl" labels from a labeled tree.

    :param labeledTree: tree built by createTreeWithLabel
    :return: a new equivalent tree without "_vpdl" entries; the input tree
             is left unmodified
    """
    labeledTreeNew = labeledTree.copy()
    nodeName = list(labeledTree.keys())[0]
    labeledTreeNew[nodeName] = labeledTree[nodeName].copy()
    for val in list(labeledTree[nodeName].keys()):
        if val == "_vpdl":
            # Drop the bookkeeping label (the original also bound it to an
            # unused local variable `a`, removed here).
            labeledTreeNew[nodeName].pop(val)
        elif type(labeledTree[nodeName][val]) == dict:
            labeledTreeNew[nodeName][val] = convertTree(labeledTree[nodeName][val])
    return labeledTreeNew
# Post-pruning: after training, evaluate each decision node on test data and
# replace it with its pre-division label when dividing does not help.
# Can be applied directly to the output of createTreeWithLabel.
def treePostPruning(labeledTree, dataTest, labelTest, names):
    """
    Post-prune a decision tree that carries pre-division labels ("_vpdl").

    :param labeledTree: node dict produced by createTreeWithLabel
    :param dataTest: test feature rows reaching this node (may be empty/None)
    :param labelTest: test labels matching dataTest
    :param names: feature names still unused on the path to this node
    :return: the pruned subtree (dict) or a leaf label when the node is pruned
    """
    newTree = labeledTree.copy()
    dataTest = np.asarray(dataTest)
    labelTest = np.asarray(labelTest)
    names = np.asarray(names)
    # Name of the feature tested at this decision node.
    featName = list(labeledTree.keys())[0]
    # Column index of that feature; argwhere returns a 2-D result, take [0][0].
    featCol = np.argwhere(names == featName)[0][0]
    names = np.delete(names, [featCol])
    # Copy the value->subtree dict and pop the pre-division label.
    # NOTE: featValueDict aliases newTree[featName], so assignments to
    # newTree[featName][featValue] below are visible through featValueDict.
    newTree[featName] = labeledTree[featName].copy()
    featValueDict = newTree[featName]
    featPreLabel = featValueDict.pop("_vpdl")
    # 0 once every child of this node is a leaf (then the node is evaluated).
    subTreeFlag = 0
    dataTestSet = {}
    labelTestSet = {}
    # An empty array (including one built from None) has sum(shape) == 0;
    # "is None" is unreliable after the value has passed through np.asarray.
    dataFlag = 1 if sum(dataTest.shape) > 0 else 0
    if dataFlag == 1:
        # Split the test data by the current feature's values.
        dataTestSet, labelTestSet = splitFeatureData(dataTest, labelTest, featCol)
    for featValue in featValueDict.keys():
        if dataFlag == 1 and type(featValueDict[featValue]) == dict:
            subTreeFlag = 1
            # Recursively prune the child subtree.
            newTree[featName][featValue] = treePostPruning(featValueDict[featValue], dataTestSet.get(featValue),
                                                           labelTestSet.get(featValue), names)
            # If pruning collapsed the child to a leaf, this node may itself
            # become a candidate for pruning below.
            if type(featValueDict[featValue]) != dict:
                subTreeFlag = 0
        # No test data reaches this node: just strip the "_vpdl" markers.
        if dataFlag == 0 and type(featValueDict[featValue]) == dict:
            subTreeFlag = 1
            newTree[featName][featValue] = convertTree(featValueDict[featValue])
    # When every child is a leaf, compare test accuracy before/after division.
    # The pre-division label was stored in the tree by createTreeWithLabel, so
    # only test data is needed here (no re-labelling with training data).
    # Guard labelTest.size to avoid a ZeroDivisionError when no test rows remain.
    if subTreeFlag == 0 and labelTest.size > 0:
        ratioPreDivision = equalNums(labelTest, featPreLabel) / labelTest.size
        equalNum = 0
        for val in labelTestSet.keys():
            equalNum += equalNums(labelTestSet[val], featValueDict[val])
        ratioAfterDivision = equalNum / labelTest.size
        # Strict "<": prune only when the division is strictly worse on the
        # test data (use "<=" instead to prune ties as well).
        if ratioAfterDivision < ratioPreDivision:
            newTree = featPreLabel
    return newTree
if __name__ == '__main__':
    # # Check entropy / information gain against Li Hang's book, p62
    # lhData, lhLabel, lhName = createDataLH()
    # print("书中H(D)为0.971,函数结果:" + str(round(infoEntropy(lhLabel), 3)))
    # print("书中g(D, A1)为0.083,函数结果:" + str(round(infoGain(lhData[:, 0], lhLabel), 3)))
    # print("书中g(D, A2)为0.324,函数结果:" + str(round(infoGain(lhData[:, 1], lhLabel), 3)))
    # print("书中g(D, A3)为0.420,函数结果:" + str(round(infoGain(lhData[:, 2], lhLabel), 3)))
    # print("书中g(D, A4)为0.363,函数结果:" + str(round(infoGain(lhData[:, 3], lhLabel), 3)))
    # # Results match the book
    #
    # # Check against the watermelon book, p75-p77
    # xgData, xgLabel, xgName = createDataXIGua()
    # print("书中Ent(D)为0.998,函数结果:" + str(round(infoEntropy(xgLabel), 4)))
    # print("书中Gain(D, 色泽)为0.109,函数结果:" + str(round(infoGain(xgData[:,0] ,xgLabel), 4)))
    # print("书中Gain(D, 根蒂)为0.143,函数结果:" + str(round(infoGain(xgData[:,1] ,xgLabel), 4)))
    # print("书中Gain(D, 敲声)为0.141,函数结果:" + str(round(infoGain(xgData[:,2] ,xgLabel), 4)))
    # print("书中Gain(D, 纹理)为0.381,函数结果:" + str(round(infoGain(xgData[:,3] ,xgLabel), 4)))
    # print("书中Gain(D, 脐部)为0.289,函数结果:" + str(round(infoGain(xgData[:,4] ,xgLabel), 4)))
    # print("书中Gain(D, 触感)为0.006,函数结果:" + str(round(infoGain(xgData[:,5] ,xgLabel), 4)))
    # # Build and plot a tree on Li Hang's data, p62
    # lhData, lhLabel, lhName = createDataLH()
    # lhTree = createTree(lhData, lhLabel, lhName, method="ID3")
    # print(lhTree)
    # createPlot(lhTree)
    # # Split watermelon data 2.0 into training and test sets
    # xgData, xgLabel, xgName = createDataXIGua()
    # xgDataTrain, xgLabelTrain, xgDataTest, xgLabelTest = splitXgData20(xgData, xgLabel)
    # # Tree without pruning
    # xgTreeTrain = createTree(xgDataTrain, xgLabelTrain, xgName, method='ID3')
    # # Tree with pre-pruning
    # xgTreePrePruning = createTreePrePruning(xgDataTrain, xgLabelTrain, xgDataTest, xgLabelTest, xgName, method='ID3')
    # # Plot the unpruned tree
    # print("剪枝前的树")
    # createPlot(xgTreeTrain)
    # # Plot the pre-pruned tree
    # print("剪枝后的树")
    # createPlot(xgTreePrePruning)
    # Build a decision tree on watermelon data 2.0; tree structures: book p81, p83.
    # The selection criterion is C4.5 (gain ratio), hence the variable name —
    # the original called this variable "ID3", which was misleading.
    xgData, xgLabel, columnName = createDataXIGua()
    c45Tree = createTree(xgData, xgLabel, columnName, method="C45")
    print(c45Tree)
    # # Post-pruning demo (tree literal carries "_vpdl" pre-division labels)
    # xgDataTrain, xgLabelTrain, xgDataTest, xgLabelTest = splitXgData20(xgData, xgLabel)
    # xgTreeBeforePostPruning = {"脐部": {"_vpdl": "是"
    #                                   , '凹陷': {'色泽': {"_vpdl": "是", '青绿': '是', '乌黑': '是', '浅白': '否'}}
    #                                   , '稍凹': {'根蒂': {"_vpdl": "是"
    #                                                     , '稍蜷': {'色泽': {"_vpdl": "是"
    #                                                                       , '青绿': '是'
    #                                                                       , '乌黑': {'纹理': {"_vpdl": "是"
    #                                                                                         , '稍糊': '是', '清晰': '否', '模糊': '是'}}
    #                                                                       , '浅白': '是'}}
    #                                                     , '蜷缩': '否'
    #                                                     , '硬挺': '是'}}
    #                                   , '平坦': '否'}}
    # xgTreePostPruning = treePostPruning(xgTreeBeforePostPruning, xgDataTest, xgLabelTest, columnName)
    # createPlot(convertTree(xgTreeBeforePostPruning))
    # createPlot(xgTreePostPruning)
参考文献:
- https://blog.csdn.net/ylhlly/article/details/93213633
- 《决策树的预剪枝与后剪枝》— zfan520 的博客(CSDN)
- 周志华《机器学习》(西瓜书)
- 《机器学习实战》