《机器学习实战》源码解析（二）：决策树

最新推荐文章于 2021-05-29 14:34:25 发布

qq_45393426

最新推荐文章于 2021-05-29 14:34:25 发布

阅读量230

点赞数 1

文章标签：机器学习

本文链接：https://blog.csdn.net/qq_45393426/article/details/106149571

版权

本系列以书中源码为主，稍作修改并添加注释，均实际运行可行。为免后来者踩坑，特此公开！欢迎打赏！
转载请注明出处！

from math import log
import operator
import matplotlib.pyplot as plt

def calcShannonEnt(dataSet):
    """Return the base-2 Shannon entropy of the class labels in ``dataSet``.

    The class label of every record is read from its last element. An empty
    data set yields 0.0 because there are no labels to sum over.
    """
    total = len(dataSet)
    # Tally how many records carry each class label.
    tally = {}
    for record in dataSet:
        label = record[-1]
        tally[label] = tally.get(label, 0) + 1
    # Accumulate -p * log2(p) over the observed labels.
    entropy = 0.0
    for count in tally.values():
        p = float(count) / total
        entropy -= p * log(p, 2)
    return entropy

def createDataSet():
    """Return a four-row toy data set and the names of its two feature columns.

    Each row is ``[feature0, feature1, class_label]``. New list objects are
    built on every call, so callers may mutate the result freely.
    """
    labels = ['no surfacing', 'flippers']
    rows = (
        (1, 1, 'yes'),
        (1, 1, 'yes'),
        (0, 1, 'no'),
        (0, 1, 'no'),
    )
    dataSet = [list(row) for row in rows]
    return dataSet, labels

# Build the toy data set and compute its entropy. The entropy value is not
# stored or printed here, so this call has no visible effect in a script.
myDat, labels = createDataSet()
calcShannonEnt(myDat)

def splitDataSet(dataSet, axis, value):
    """Return the rows whose column ``axis`` equals ``value``, with that column removed.

    The input rows are not modified; every returned row is a new list made of
    the elements before and after position ``axis``.
    """
    return [
        row[:axis] + row[axis + 1:]
        for row in dataSet
        if row[axis] == value
    ]

def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the largest information gain (ID3).

    Every column except the last (the class label) is a candidate. The gain of
    a feature is the entropy of the whole set minus the size-weighted entropy
    of the subsets obtained by splitting on that feature. Returns -1 when no
    feature achieves a strictly positive gain.
    """
    featureCount = len(dataSet[0]) - 1
    parentEntropy = calcShannonEnt(dataSet)
    totalRows = float(len(dataSet))
    topGain, topIndex = 0.0, -1
    for index in range(featureCount):
        # Distinct values taken by this feature across the data set.
        distinctValues = set(row[index] for row in dataSet)
        conditionalEntropy = 0.0
        for val in distinctValues:
            subset = splitDataSet(dataSet, index, val)
            weight = len(subset) / totalRows  # empirical probability of this value
            conditionalEntropy += weight * calcShannonEnt(subset)
        gain = parentEntropy - conditionalEntropy
        if gain > topGain:
            topGain, topIndex = gain, index
    return topIndex

def majorityCnt(classList):
    """Return the label that occurs most often in ``classList``.

    Ties go to the label that appears first in ``classList``: the counts are
    kept in insertion order and the sort is stable.

    Raises:
        IndexError: if ``classList`` is empty.
    """
    classCount = {}  # label -> number of votes
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # dict.items() works on both Python 2 and 3; the former dict.iteritems()
    # exists only on Python 2. itemgetter(1) sorts by the vote count.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    Args:
        dataSet: list of rows; the last element of each row is the class
            label and the preceding elements are feature values.
        labels: feature names, where ``labels[i]`` names column ``i`` of
            ``dataSet``. The list is not modified.

    Returns:
        Either a class label (a leaf) or a dict of the form
        ``{feature_name: {feature_value: subtree_or_label}}``.
    """
    classList = [example[-1] for example in dataSet]
    # Stop when every row carries the same class label.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Stop when only the class column is left: fall back to a majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat < 0:
        # No feature yields a positive information gain. Using -1 as an index
        # would silently split on the class-label column, so vote instead.
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    # Build the reduced label list without touching the caller's list, which
    # is typically still needed afterwards (e.g. by classify()).
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    myTree = {bestFeatLabel: {}}
    for value in set(example[bestFeat] for example in dataSet):
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

# NOTE: the triple-quoted string below is a bare string expression, so nothing
# inside it runs. It holds a disabled demo of plotNode()/createPlot() that
# draws two sample nodes; functions with the same names are defined later.
'''
#使用文本注解绘制树节点
decisionNode = dict(boxstyle="sawtooth",fc="0.8") #boxstyle为文本框的类型，sawtooth是锯齿形，fc是边框线粗细
leafNode = dict(boxstyle="round4",fc="0.8")#定义决策树的叶子节点的描述属性
arrow_args = dict(arrowstyle="<-")#定义决策树的箭头属性

def plotNode(nodeTxt,centerPt,parentPt,nodeType):#绘制节点
    createPlot.axl.annotate(nodeTxt,xy=parentPt,xycoords='axes fraction',\
    xytext=centerPt,textcoords='axes fraction',va="center",ha="center",bbox=nodeType,arrowprops=arrow_args)
    
def createPlot():
    fig =plt.figure(1,figsize=(12,9),facecolor='white')
    fig.clf()
    createPlot.axl = plt.subplot(111,frameon=False)
    plotNode('决策节点',(0.5,0.1),(0.1,0.5),decisionNode)
    plotNode('叶节点',(0.8,0.1),(0.3,0.8),leafNode)
    plt.show()
    
createPlot()
'''

#获取叶节点的数目和树的层数
def getNumLeafs(myTree):
    """Return the number of leaf nodes in a nested-dict decision tree.

    ``myTree`` has the form ``{feature_name: {feature_value: child}}``. A
    child that is a dict (or dict subclass) is a subtree; anything else is
    counted as one leaf.

    Raises:
        IndexError: if ``myTree`` is an empty dict.
    """
    rootLabel = list(myTree)[0]
    # isinstance() matches the check used by classify() and also recognises
    # dict subclasses, which a comparison on the type name would miss.
    return sum(
        getNumLeafs(child) if isinstance(child, dict) else 1
        for child in myTree[rootLabel].values()
    )

def getTreeDepth(myTree):
    """Return the number of decision levels on the longest root-to-leaf path.

    ``myTree`` has the form ``{feature_name: {feature_value: child}}``. A
    tree whose children are all leaves has depth 1; a root with no children
    has depth 0.

    Raises:
        IndexError: if ``myTree`` is an empty dict.
    """
    rootLabel = list(myTree)[0]
    maxDepth = 0
    for child in myTree[rootLabel].values():
        # isinstance() matches the check used by classify() and also
        # recognises dict subclasses as subtrees.
        if isinstance(child, dict):
            thisDepth = 1 + getTreeDepth(child)
        else:
            thisDepth = 1
        maxDepth = max(maxDepth, thisDepth)
    return maxDepth

#更新绘制树形图函数
# Style dictionaries handed to matplotlib's annotate() when drawing the tree.
# "boxstyle" selects the box shape; "fc" is the box face (fill) colour, given
# here as a grey level -- it is not a line width.
decisionNode = {"boxstyle": "sawtooth", "fc": "0.8"}  # box for decision (internal) nodes
leafNode = {"boxstyle": "round4", "fc": "0.8"}  # box for leaf nodes
arrow_args = {"arrowstyle": "<-"}  # arrow connecting a node to its parent

def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Annotate the shared axes with one tree node.

    The text ``nodeTxt`` is placed in a box at ``centerPt`` and an arrow is
    drawn between it and ``parentPt``. Both points are in axes-fraction
    coordinates. ``nodeType`` is the box style dict (decisionNode or
    leafNode). Draws on ``createPlot.ax1``, so createPlot() must have set it.
    """
    createPlot.ax1.annotate(
        nodeTxt,
        xy=parentPt,
        xycoords='axes fraction',
        xytext=centerPt,
        textcoords='axes fraction',
        va="center",
        ha="center",
        bbox=nodeType,
        arrowprops=arrow_args,
    )

def plotMidText(cntrPt, parentPt, txtString):
    """Write ``txtString`` halfway between ``cntrPt`` and ``parentPt``.

    Used to label an edge with the feature value it stands for. Draws on
    ``createPlot.ax1``, so createPlot() must have set it.
    """
    childX, childY = cntrPt[0], cntrPt[1]
    midX = (parentPt[0] - childX) / 2.0 + childX
    midY = (parentPt[1] - childY) / 2.0 + childY
    createPlot.ax1.text(
        midX,
        midY,
        txtString,
        va="center",
        ha="center",
        rotation=30,
    )

def plotTree(myTree, parentPt, nodeTxt):
    """Recursively draw ``myTree`` below ``parentPt``.

    Args:
        myTree: nested-dict tree ``{feature_name: {feature_value: child}}``.
        parentPt: (x, y) of the parent node in axes-fraction coordinates.
        nodeTxt: text for the edge from the parent to this subtree's root.

    Layout state lives in function attributes that createPlot() initialises:
    ``totalW``/``totalD`` (leaf count and depth of the whole tree) and
    ``xOff``/``yOff`` (the running drawing cursor).
    """
    numLeafs = getNumLeafs(myTree)  # determines the horizontal span of this subtree
    firstStr = list(myTree)[0]  # feature name shown in this decision node
    # Centre the decision node above the leaves it covers.
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    # Move one level down for the children...
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key, child in secondDict.items():
        # A dict child is a subtree (same check as classify()); anything
        # else is a leaf.
        if isinstance(child, dict):
            plotTree(child, cntrPt, str(key))
        else:
            # Advance the cursor by one leaf slot, then draw the leaf.
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(child, (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    # ...and restore the level before returning to the caller.
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD

def createPlot(inTree):
    """Draw the nested-dict decision tree ``inTree`` in figure 1 and show it.

    Creates a frameless, tick-free axes stored as ``createPlot.ax1`` and
    initialises the layout attributes that plotTree() reads and updates.
    """
    figure = plt.figure(1, figsize=(8, 8), facecolor='white')
    figure.clf()
    # Frameless axes without tick marks.
    createPlot.ax1 = plt.subplot(111, frameon=False, xticks=[], yticks=[])
    # Overall size of the tree, used to scale node positions.
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    # Start the cursor half a leaf slot left of the axes, at the top.
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()

#创建树结构测试上2函数
def retrieveTree(i):
    """Return the ``i``-th of two hand-written sample trees.

    The trees are rebuilt on every call, so mutating a returned tree does not
    affect later calls. ``i`` is used as a plain list index.
    """
    twoLevel = {'flippers': {0: 'no', 1: 'yes'}}
    threeLevel = {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}
    samples = [
        {'no surfacing': {0: 'no', 1: twoLevel}},
        {'no surfacing': {0: 'no', 1: threeLevel}},
    ]
    return samples[i]
# Fetch the first sample tree and add an extra branch to its root:
# feature value 3 leads to the leaf 'maybe'.
myTree = retrieveTree(0)
myTree['no surfacing'][3] = 'maybe'

#使用决策树的分类函数
def classify(inputTree, featLabels, testVec):
    """Classify ``testVec`` by walking the nested-dict decision tree.

    Args:
        inputTree: tree of the form ``{feature_name: {feature_value: child}}``
            where a child is either a subtree (dict) or a class label.
        featLabels: feature names; ``featLabels[i]`` names ``testVec[i]``.
        testVec: feature values of the sample to classify.

    Returns:
        The class label stored in the leaf that the sample reaches.

    Raises:
        ValueError: if a feature named in the tree is missing from ``featLabels``.
        KeyError: if the sample has a feature value with no branch in the tree.
    """
    # list(d)[0] works on Python 2 and 3; d.keys()[0] fails on Python 3
    # because dict views are not subscriptable.
    firstStr = list(inputTree)[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)  # position of this feature in testVec
    valueOfFeat = secondDict[testVec[featIndex]]
    if isinstance(valueOfFeat, dict):
        # Still an internal node: keep descending.
        return classify(valueOfFeat, featLabels, testVec)
    return valueOfFeat

#使用pickle模块存储决策树
def storeTree(inputTree, filename):
    """Serialise ``inputTree`` to ``filename`` with pickle.

    Any existing file at ``filename`` is overwritten.
    """
    import pickle
    # pickle writes bytes, so the file must be opened in binary mode; the
    # with-block closes it even if dump() raises.
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)
    
def grabTree(filename):
    """Load and return the object that storeTree() pickled into ``filename``.

    NOTE: unpickling can execute arbitrary code, so only load files that come
    from a trusted source.
    """
    import pickle
    # pickle reads bytes, so the file must be opened in binary mode; the
    # with-block makes sure the handle is closed.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)

#实例，使用决策树预测隐形眼镜类型
# Example: learn and plot a decision tree for the contact-lens data set.
# Replace the placeholder path below with the location of your lenses.txt.
with open(r'选择你自己的数据集存储路径\lenses.txt') as fr:
    # One record per line, tab-separated; blank lines are skipped so they do
    # not turn into bogus single-field rows.
    lenses = [inst.strip().split('\t') for inst in fr if inst.strip()]
lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
# Pass a copy so lensesLabels stays intact for later use (e.g. classify()),
# regardless of how createTree treats its argument.
lensesTree = createTree(lenses, lensesLabels[:])
createPlot(lensesTree)

qq_45393426

关注

1
点赞
踩
3

收藏

觉得还不错? 一键收藏
1
评论
《机器学习实战》源码解析（二）：决策树

本文以书中源码为主，稍作修改并添加注释，均实际运行可行。为免后来者踩坑，特此公开！欢迎打赏！转载请注明出处！from math import logimport operatorimport matplotlib.pyplot as pltdef calcShannonEnt(dataSet): #定义计算信息熵函数 numEntries = len(dataSet) #获得数据集长度 labelCounts ={} #创建符号类别字典 for featVec in da
复制链接

扫一扫