Implementing ID3 and CART Decision Trees in Python

The dataset used by the code is the following toy table, with two binary features and a yes/no class label:

no surfacing    flippers    class
1               1           yes
1               1           yes
1               0           no
0               1           no
0               1           no
The following is the ID3 decision tree implemented in Python:

from math import log
import operator
dataSet=[[1,1,'yes'],[1,1,'yes'],[1,0,'no'],[0,1,'no'],[0,1,'no']]  # training samples: two binary features plus a class label
labels=['no surfacing','flippers']    # feature names
def calcShannonEnt(dataSet):    # compute the Shannon entropy of the class labels
    numEntries=len(dataSet)
    labelCounts={}
    for featVec in dataSet:
        currentLabel=featVec[-1]   # the class label is the last element of each sample
        if currentLabel not in labelCounts.keys():    # count the occurrences of each class
            labelCounts[currentLabel]=0
        labelCounts[currentLabel]+=1
    shannonEnt=0.0
    for key in labelCounts:
        prob=float(labelCounts[key])/numEntries   # probability of this class among all samples
        shannonEnt-=prob*log(prob,2)   # H = -sum_k p_k * log2(p_k)
    return shannonEnt
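# Sanity check (my own addition, not in the original post): with 2 'yes' and
# 3 'no' samples, calcShannonEnt(dataSet) = -(2/5)*log(2/5,2) - (3/5)*log(3/5,2) ≈ 0.971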

def splitDataSet(dataSet,axis,value):   # return the subset where feature `axis` equals `value`, with that column removed
    retDataSet=[]
    for featVec in dataSet:   # keep matching samples and drop the tested column so recursion can continue on the rest
        if featVec[axis]==value:
            reducedFeatVec=featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
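# Example (my own illustration): splitDataSet(dataSet,0,1) keeps the three
# samples whose first feature is 1 and drops that column:
# [[1, 'yes'], [1, 'yes'], [0, 'no']]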

def chooseBestFeatureToSplit(dataSet):  # compute the information gain of each feature and pick the feature with the largest gain
    numFeatures=len(dataSet[0])-1
    baseEntropy=calcShannonEnt(dataSet)
    bestInfoGain=0.0
    bestFeature=-1
    for i in range(numFeatures):
        featureList=[example[i] for example in dataSet]
        uniqueVals=set(featureList)
        newEntropy=0.0
        for value in uniqueVals:
            subDataSet=splitDataSet(dataSet,i,value)
            prob=len(subDataSet)/float(len(dataSet))
            newEntropy+=prob*calcShannonEnt(subDataSet)   # weighted entropy after splitting on feature i
        infoGain=baseEntropy-newEntropy   # information gain of feature i
        if infoGain>bestInfoGain:
            bestInfoGain=infoGain
            bestFeature=i
    return bestFeature
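# Worked numbers for this dataset (my own check): gain('no surfacing') =
# 0.971 - (3/5*0.918 + 2/5*0) ≈ 0.420 and gain('flippers') = 0.971 - 4/5*1.0 ≈ 0.171,
# so 'no surfacing' wins the first split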

def majorityCnt(classList): # if a leaf still contains multiple classes, pick the final label by majority vote
    classCount={}
    for vote in classList:
        if vote not in classCount.keys():  # count how many times each class appears
            classCount[vote]=0
        classCount[vote]+=1
    sortedClassCount=sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)  # sort by count, descending
    return sortedClassCount[0][0]

def creatTree(dataSet,labels):   # combine the functions above into a recursive tree builder
    classList=[example[-1] for example in dataSet]
    if classList.count(classList[0])==len(classList):   # all samples share one class: return a leaf
        return classList[0]
    if len(dataSet[0])==1:   # no features left: return the majority class
        return majorityCnt(classList)
    bestFeat=chooseBestFeatureToSplit(dataSet)
    bestFeatLabel=labels[bestFeat]
    myTree={bestFeatLabel:{}}
    del(labels[bestFeat])   # note: this mutates the caller's labels list
    featValues=[example[bestFeat] for example in dataSet]
    uniqueVals=set(featValues)
    for value in uniqueVals:
        subLabels=labels[:]
        myTree[bestFeatLabel][value]=creatTree(splitDataSet(dataSet,bestFeat,value),subLabels)
    return myTree

if __name__=='__main__':
    print(creatTree(dataSet,labels))

Running the script prints the learned tree:

{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
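The post stops at printing the tree, but a tree in this nested-dict format can also be used for prediction. The classify helper below is a minimal sketch of my own (it is not part of the original code); it assumes the tree structure produced by creatTree plus the full feature-name list:

def classify(inputTree,featLabels,testVec):   # hypothetical helper: walk the tree until a leaf (class label) is reached
    firstStr=list(inputTree.keys())[0]        # feature name tested at this node
    secondDict=inputTree[firstStr]
    featIndex=featLabels.index(firstStr)      # map the feature name back to a column index
    valueOfFeat=secondDict[testVec[featIndex]]
    if isinstance(valueOfFeat,dict):          # internal node: descend into the matching branch
        return classify(valueOfFeat,featLabels,testVec)
    return valueOfFeat                        # leaf: the predicted class

myTree=creatTree(dataSet,labels[:])   # pass a copy, since creatTree deletes entries from labels
print(classify(myTree,labels,[1,0]))  # -> 'no'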
The following is the CART decision tree implemented in Python:

import operator
dataSet=[[1,1,'yes'],[1,1,'yes'],[1,0,'no'],[0,1,'no'],[0,1,'no']]  # training samples: two binary features plus a class label
labels=['no surfacing','flippers']    # feature names
def calcGini(dataSet):    # compute the Gini index of the class labels
    numEntries=len(dataSet)
    labelCounts={}
    for featVec in dataSet:
        currentLabel=featVec[-1]   # the class label is the last element of each sample
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel]=0
        labelCounts[currentLabel]+=1
    Gini=1.0
    for key in labelCounts:
        prob=float(labelCounts[key])/numEntries
        Gini-=pow(prob,2)   # Gini = 1 - sum_k p_k^2
    return Gini
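# Sanity check (my own addition): the Gini index of the full dataset's class
# labels is 1 - (2/5)**2 - (3/5)**2 = 0.48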

def splitDataSet(dataSet,axis,value):   # return the subset where feature `axis` equals `value`, with that column removed
    retDataSet=[]
    for featVec in dataSet:   # keep matching samples and drop the tested column so recursion can continue on the rest
        if featVec[axis]==value:
            reducedFeatVec=featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

def chooseBestFeatureToSplit(dataSet):  # compute the weighted Gini index of each feature and pick the feature with the smallest value
    numFeatures=len(dataSet[0])-1
    bestGini=float('inf')
    bestFeature=-1
    for i in range(numFeatures):
        featureList=[example[i] for example in dataSet]
        uniqueVals=set(featureList)
        newGini=0.0
        for value in uniqueVals:
            subDataSet=splitDataSet(dataSet,i,value)
            prob=len(subDataSet)/float(len(dataSet))
            newGini+=prob*calcGini(subDataSet)   # weighted Gini of the class labels after splitting on feature i
        if newGini<bestGini:
            bestGini=newGini
            bestFeature=i
    return bestFeature
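# Worked numbers (my own check): splitting on 'no surfacing' gives weighted Gini
# 3/5*(1-(2/3)**2-(1/3)**2) + 2/5*0 ≈ 0.267, while 'flippers' gives 4/5*0.5 + 1/5*0 = 0.4,
# so feature 0 is chosen here too, matching ID3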

def majorityCnt(classList): # if a leaf still contains multiple classes, pick the final label by majority vote
    classCount={}
    for vote in classList:
        if vote not in classCount.keys():  # count how many times each class appears
            classCount[vote]=0
        classCount[vote]+=1
    sortedClassCount=sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)  # sort by count, descending
    return sortedClassCount[0][0]

def creatTree(dataSet,labels):   # combine the functions above into a recursive tree builder
    classList=[example[-1] for example in dataSet]
    if classList.count(classList[0])==len(classList):   # all samples share one class: return a leaf
        return classList[0]
    if len(dataSet[0])==1:   # no features left: return the majority class
        return majorityCnt(classList)
    bestFeat=chooseBestFeatureToSplit(dataSet)
    bestFeatLabel=labels[bestFeat]
    myTree={bestFeatLabel:{}}
    del(labels[bestFeat])   # note: this mutates the caller's labels list
    featValues=[example[bestFeat] for example in dataSet]
    uniqueVals=set(featValues)
    for value in uniqueVals:
        subLabels=labels[:]
        myTree[bestFeatLabel][value]=creatTree(splitDataSet(dataSet,bestFeat,value),subLabels)
    return myTree

if __name__=='__main__':
    print(creatTree(dataSet,labels))

Running the script prints the same tree as ID3 does on this dataset:

{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
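Since both scripts learn the same tree on this toy dataset, the classify sketch from the ID3 section works unchanged here as well, e.g. classify(myTree, ['no surfacing','flippers'], [0,1]) returns 'no'.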
