【机器学习】 - 决策树(西瓜数据集)

机器学习 专栏收录该内容
2 篇文章 0 订阅

周志华《机器学习》(西瓜书)中“决策树”一章的代码实现

# Build a decision tree (information-gain splitting) on the watermelon dataset 2.0
import math
from collections import Counter

import numpy as np

def calcEntropy(dataSet):
    """Return the Shannon entropy (in bits) of the class labels in ``dataSet``.

    Args:
        dataSet: Sequence of samples; the last element of each sample is
            its class label (labels must be hashable).

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of every
        distinct label. An empty ``dataSet`` yields ``0``.
    """
    mD = len(dataSet)
    # Count all labels in one pass instead of calling list.count() once
    # per distinct label.
    labelCounts = Counter(x[-1] for x in dataSet)
    ent = 0
    for mDv in labelCounts.values():
        prop = float(mDv) / mD
        # math.log2 is used instead of np.math.log: the np.math alias was
        # deprecated in NumPy 1.25 and removed in NumPy 2.0.
        ent = ent - prop * math.log2(prop)

    return ent

def splitDataSet(dataSet, index, feature):
    """Select the samples whose attribute at ``index`` equals ``feature``.

    Args:
        dataSet: Sequence of samples, each a list (or tuple) of values.
        index: Column index of the attribute to split on.
        feature: Value the attribute must have for a sample to be kept.

    Returns:
        A new list containing one new list per matching sample, with the
        column at ``index`` removed. Neither ``dataSet`` nor its samples
        are modified.
    """
    # list(...) on both slices keeps the result a list of lists even when
    # the samples are tuples.
    return [
        list(data[:index]) + list(data[index + 1:])
        for data in dataSet
        if data[index] == feature
    ]

def chooseBestFeature(dataSet):
    """Pick the attribute with the largest information gain.

    Every column of ``dataSet`` except the last one (the class label) is
    treated as a candidate attribute. The gain of a column is the entropy
    of ``dataSet`` minus the size-weighted entropies of the subsets
    produced by splitting on each of the column's values.

    Args:
        dataSet: Non-empty list of samples; the last element of each
            sample is its class label.

    Returns:
        Index of the column with the highest gain. On ties the lowest
        index wins; ``-1`` is returned when there is no attribute column.
    """
    baseEntropy = calcEntropy(dataSet)
    sampleCount = len(dataSet)
    bestIndex = -1
    bestGain = -100
    for column in range(len(dataSet[0]) - 1):
        gain = baseEntropy
        distinctValues = set([row[column] for row in dataSet])
        for value in distinctValues:
            subset = splitDataSet(dataSet, column, value)  # samples with this value
            weight = float(len(subset)) / sampleCount
            gain = gain - weight * calcEntropy(subset)
        # The first column is always accepted; later ones must be strictly better.
        if bestIndex == -1 or bestGain < gain:
            bestGain = gain
            bestIndex = column

    return bestIndex

def mainLabel(labelList):
    """Return the most frequent label in ``labelList``.

    Args:
        labelList: Non-empty list of hashable labels.

    Returns:
        The label with the highest count. When several labels share the
        highest count, the one that occurs first in ``labelList`` is
        returned, so the result does not depend on set/hash ordering.

    Raises:
        IndexError: If ``labelList`` is empty.
    """
    # Counter keeps first-seen order and most_common() preserves it among
    # equal counts, which makes tie-breaking reproducible between runs.
    return Counter(labelList).most_common(1)[0][0]

def createDecisionTree(dataSet, featureNames):
    """Recursively build a decision tree as nested dictionaries.

    Branches are created only for the attribute values that actually occur
    in the (sub)dataset being split.

    Args:
        dataSet: Non-empty list of samples; the last element of each
            sample is its class label.
        featureNames: Names of the columns of ``dataSet``, indexed like
            the columns. It is not modified.

    Returns:
        A class label for a leaf, otherwise a dict of the form
        ``{featureName: {featureValue: subtree_or_label, ...}}``.
    """
    labelList = [x[-1] for x in dataSet]
    if len(dataSet[0]) == 1:  # only the label column is left: nothing to split on
        return mainLabel(labelList)  # fall back to the majority label
    if labelList.count(labelList[0]) == len(labelList):  # all samples share one label
        return labelList[0]

    bestFeatureIndex = chooseBestFeature(dataSet)
    bestFeatureName = featureNames[bestFeatureIndex]
    # Build the remaining names by slicing rather than pop(), so the list
    # passed in by the caller is left untouched.
    remainingNames = (
        featureNames[:bestFeatureIndex] + featureNames[bestFeatureIndex + 1:]
    )
    myTree = {bestFeatureName: {}}
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # branch order of the printed tree is the same on every run.
    for feature in dict.fromkeys(x[bestFeatureIndex] for x in dataSet):
        splitedDataSet = splitDataSet(dataSet, bestFeatureIndex, feature)
        myTree[bestFeatureName][feature] = createDecisionTree(
            splitedDataSet, remainingNames
        )
    return myTree

def readWatermelonDataSet():
    """Read the watermelon dataset 2.0 from ``周志华_西瓜数据集2.txt``.

    The file is opened relative to the current working directory with the
    platform's default text encoding. Its first line is a comma-separated
    header (anything after the first space on that line is ignored); every
    following non-blank line is one comma-separated sample.

    Returns:
        A tuple ``(dataSet, labels)`` where ``dataSet`` is a list of
        samples (each a list of strings) and ``labels`` is the list of
        column names taken from the header.
    """
    # The context manager closes the file even if reading fails.
    with open("周志华_西瓜数据集2.txt") as ifile:
        featureName = ifile.readline()  # header line
        lines = ifile.readlines()

    # Strip the newline so the last column name does not end with '\n'.
    labels = featureName.split(' ')[0].rstrip('\n').split(',')
    dataSet = []
    for line in lines:
        tmp = line.split('\n')[0]
        if not tmp.strip():
            # A blank (e.g. trailing) line would otherwise become a bogus
            # one-column sample.
            continue
        dataSet.append(tmp.split(','))

    return dataSet, labels

def main():
    """Load the watermelon dataset, build the decision tree and print it."""
    samples, columnNames = readWatermelonDataSet()
    tree = createDecisionTree(samples, columnNames)
    print(tree)


# Run the demo only when this file is executed as a script.
if __name__ == "__main__":
    main()

最后输出的决策树是:
{'纹理': {'模糊': '否', '清晰': {'根蒂': {'稍蜷': {'色泽': {'乌黑': {'触感': {'硬滑': '是', '软粘': '否'}}, '青绿': '是'}}, '蜷缩': '是', '硬挺': '否'}}, '稍糊': {'触感': {'硬滑': '否', '软粘': '是'}}}}

画出来是这个样子的:
在这里插入图片描述

这棵树和书上给出的结果不太一样(少了一些分支)。
后来参考了一篇CSDN文章,
文中说需要补全决策树:
在这里插入图片描述
后来又仔细看了伪代码
在这里插入图片描述
主要是对画红线处的理解。
这里的“每一个值”,到底是指该属性在原始(完整)数据集中的所有取值,还是指在当前划分后的子数据集中出现过的取值?
上面的代码采用的是后者,而书上指的是前者。

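把 createDecisionTree() 和 readWatermelonDataSet() 函数修改为下面的版本(main() 也要相应修改:用 dataSet, featureNames, featureNamesSet = readWatermelonDataSet() 读取数据,再调用 createFullDecisionTree(dataSet, featureNames, featureNamesSet, [x[-1] for x in dataSet])):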

def createFullDecisionTree(dataSet, featureNames, featureNamesSet, labelListParent):
    """Recursively build a decision tree with a branch for every known value.

    Unlike a tree that only branches on values present in the current
    subset, this version branches on every value listed for the chosen
    attribute in ``featureNamesSet``. A value with no samples in the
    current subset becomes a leaf labelled with the parent's majority
    label.

    Args:
        dataSet: List of samples (possibly empty); the last element of
            each sample is its class label.
        featureNames: Names of the columns of ``dataSet``, indexed like
            the columns. It is not modified.
        featureNamesSet: For each attribute column, the list of values to
            create branches for. It is not modified.
        labelListParent: Class labels of the parent node's samples, used
            when ``dataSet`` is empty.

    Returns:
        A class label for a leaf, otherwise a dict of the form
        ``{featureName: {featureValue: subtree_or_label, ...}}``.
    """
    labelList = [x[-1] for x in dataSet]
    if len(dataSet) == 0:  # no sample has this value: use the parent's majority label
        return mainLabel(labelListParent)
    if len(dataSet[0]) == 1:  # only the label column is left: nothing to split on
        return mainLabel(labelList)  # fall back to the majority label
    if labelList.count(labelList[0]) == len(labelList):  # all samples share one label
        return labelList[0]

    bestFeatureIndex = chooseBestFeature(dataSet)
    bestFeatureName = featureNames[bestFeatureIndex]
    featureList = featureNamesSet[bestFeatureIndex]
    # Build the remaining names/value lists by slicing rather than pop(),
    # so the lists passed in by the caller are left untouched.
    remainingNames = (
        featureNames[:bestFeatureIndex] + featureNames[bestFeatureIndex + 1:]
    )
    remainingValueSets = (
        featureNamesSet[:bestFeatureIndex] + featureNamesSet[bestFeatureIndex + 1:]
    )
    myTree = {bestFeatureName: {}}
    # dict.fromkeys de-duplicates while keeping the given order, so the
    # branch order of the printed tree is the same on every run.
    for feature in dict.fromkeys(featureList):
        splitedDataSet = splitDataSet(dataSet, bestFeatureIndex, feature)
        myTree[bestFeatureName][feature] = createFullDecisionTree(
            splitedDataSet, remainingNames, remainingValueSets, labelList
        )
    return myTree


def readWatermelonDataSet():
    """Read the watermelon dataset 2.0 and the value list of every attribute.

    The file ``周志华_西瓜数据集2.txt`` is opened relative to the current
    working directory with the platform's default text encoding. Its first
    line is a comma-separated header (anything after the first space on
    that line is ignored); every following non-blank line is one
    comma-separated sample.

    Returns:
        A tuple ``(dataSet, featureNames, featureNamesSet)``:
        ``dataSet`` is a list of samples (each a list of strings),
        ``featureNames`` is the list of column names from the header, and
        ``featureNamesSet[i]`` lists the distinct values of column ``i``
        (every column except the last) in order of first appearance.
    """
    # The context manager closes the file even if reading fails.
    with open("周志华_西瓜数据集2.txt") as ifile:
        featureName = ifile.readline()  # header line
        lines = ifile.readlines()

    # Strip the newline so the last column name does not end with '\n'.
    featureNames = featureName.split(' ')[0].rstrip('\n').split(',')
    dataSet = []
    for line in lines:
        tmp = line.split('\n')[0]
        if not tmp.strip():
            # A blank (e.g. trailing) line would otherwise become a bogus
            # one-column sample.
            continue
        dataSet.append(tmp.split(','))

    # Collect the distinct values of every attribute column (the last
    # column is the class label). A file without samples has no columns.
    featureCount = len(dataSet[0]) - 1 if dataSet else 0
    featureNamesSet = []
    for i in range(featureCount):
        # dict.fromkeys de-duplicates while keeping first-seen order, so
        # the result is identical on every run.
        featureNamesSet.append(list(dict.fromkeys(x[i] for x in dataSet)))

    return dataSet, featureNames, featureNamesSet

现在和书上的一样了
在这里插入图片描述

  • 9
    点赞
  • 8
    评论
  • 47
    收藏
  • 打赏
    打赏
  • 扫一扫,分享海报

评论8
请先登录 后发表评论~
©️2021 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页

打赏作者

ylemfei

你的鼓励将是我创作的最大动力

¥2 ¥4 ¥6 ¥10 ¥20
输入1-500的整数
余额支付 (余额:-- )
扫码支付
扫码支付:¥2
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值