1.ID3算法
我们要对这样一组数据来构建一棵决策树来预测在{天气=晴,温度=适中,湿度=正常,风速=弱}的情况下,是否适合活动。
信息熵为:
$$Ent(活动)=-\frac{9}{14}\log_{2}\frac{9}{14}-\frac{5}{14}\log_{2}\frac{5}{14}=0.9402859586706309$$
信息增益的计算:
$$Info_{天气}(活动)=\frac{5}{14}info_{晴}(活动)+\frac{4}{14}info_{阴}(活动)+\frac{5}{14}info_{雨}(活动)\\
=\frac{5}{14}\left(-\frac{3}{5}\log_{2}\frac{3}{5}-\frac{2}{5}\log_{2}\frac{2}{5}\right)+\frac{4}{14}\left(-\frac{4}{4}\log_{2}\frac{4}{4}\right)+\frac{5}{14}\left(-\frac{2}{5}\log_{2}\frac{2}{5}-\frac{3}{5}\log_{2}\frac{3}{5}\right)\\
=0.6935361388961918$$
(其中"阴"分支 4 个样本全为 yes,熵为 0;约定 $0\log_{2}0=0$。)
信息增益为:
$$Gain(天气)=Ent(活动)-Info_{天气}(活动)=0.9402859586706309-0.6935361388961918=0.2467498197744391$$
同理可以算出剩余三项的条件熵和信息增益分别为
| 特征 | $info_{A}(D)$(条件熵) | $Gain(A)$(信息增益) |
|---|---|---|
| 温度 | 0.9110633930116763 | 0.029222565658954647 |
| 湿度 | 0.7884504573082896 | 0.15183550136234136 |
| 风速 | 0.8921589282623617 | 0.04812703040826927 |
由此可以看出天气的信息增益最大,所以选择天气作为最优解,接下来数据集根据天气的不同被划分了三个部分:
紧接着对每个子数据集重复上面的步骤,构造出对应的子决策树。
from math import log
import operator
import pickle
# The classic 14-sample "play tennis" data set.
# Columns: outlook, temperature, humidity, wind, class label (play? yes/no).
dataSet = [['sunny','hot','high','weak','no'],
['sunny','hot','high','strong','no'],
['overcast','hot','high','weak','yes'],
['rain','mild','high','weak','yes'],
['rain','cool','normal','weak','yes'],
['rain','cool','normal','strong','no'],
['overcast','cool','normal','strong','yes'],
['sunny','mild','high','weak','no'],
['sunny','cool','normal','weak','yes'],
['rain','mild','normal','weak','yes'],
['sunny','mild','normal','strong','yes'],
['overcast','mild','high','strong','yes'],
['overcast','hot','normal','weak','yes'],
['rain','mild','high','strong','no']]
# Feature names, aligned with the first four columns of every row.
labels = ['outlook','temperature','humidity','wind']
# Demo: collect the distinct values taken by the first feature (outlook).
featlist=[number[0] for number in dataSet]
uniquelVals=set(featlist)
print(uniquelVals)
{'overcast', 'rain', 'sunny'}
def calcShanonEnt(dataset):
    """Shannon entropy of the class labels (last column) of `dataset`.

    dataset: list of sample rows; row[-1] is the class label.
    Returns the entropy in bits.
    """
    total = len(dataset)
    # Tally how many times each class label occurs.
    counts = {}
    for sample in dataset:
        label = sample[-1]
        counts[label] = counts.get(label, 0) + 1
    # Ent = -sum(p * log2 p) over the label distribution.
    entropy = 0.0
    for occurrences in counts.values():
        p = occurrences / float(total)
        entropy -= p * log(p, 2)
    return entropy
calcShanonEnt(dataSet)
0.9402859586706309
def splitDataSet(dataSet, axis, value):
    """Select the rows whose column `axis` equals `value`, dropping that column.

    Returns a new list of rows; each kept row omits the split feature so the
    recursion never re-splits on it.
    """
    return [row[:axis] + row[axis + 1:] for row in dataSet if row[axis] == value]
splitDataSet(dataSet,0,'rain')
[['mild', 'high', 'weak', 'yes'],
['cool', 'normal', 'weak', 'yes'],
['cool', 'normal', 'strong', 'no'],
['mild', 'normal', 'weak', 'yes'],
['mild', 'high', 'strong', 'no']]
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain (ID3).

    dataSet: rows whose last column is the class label.
    Returns -1 when no feature yields a positive gain.
    """
    featureCount = len(dataSet[0]) - 1        # the label column is not a feature
    baseEntropy = calcShanonEnt(dataSet)      # entropy before any split
    bestFeature = -1
    bestGain = 0
    for idx in range(featureCount):
        distinctValues = {row[idx] for row in dataSet}
        # Conditional entropy: weighted entropy of each value's subset.
        conditional = 0
        for val in distinctValues:
            subset = splitDataSet(dataSet, idx, val)
            weight = len(subset) / float(len(dataSet))
            conditional += weight * calcShanonEnt(subset)
        gain = baseEntropy - conditional
        if gain > bestGain:
            bestGain = gain
            bestFeature = idx
    return bestFeature
# Majority vote over class labels, used when features are exhausted.
def majorityCnt(classList):
    """Return the class label that occurs most often in `classList`."""
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # Rank label/count pairs by count, highest first; the winner leads.
    ranked = sorted(tally.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]
def creatTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    dataSet: rows whose last column is the class label.
    labels: feature names aligned with the feature columns; NOTE: the chosen
            feature's name is deleted from this list (mutates the caller's list).
    Returns a class label (leaf) or {feature_name: {value: subtree}}.
    """
    classList = [row[-1] for row in dataSet]
    # Pure node: every sample carries the same class -> leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[-1]
    # Features exhausted (only the label column remains) -> majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLable = labels[bestFeat]
    myTree = {bestFeatLable: {}}
    del labels[bestFeat]          # this feature's label is now consumed
    # One branch per distinct value of the chosen feature.
    for value in {row[bestFeat] for row in dataSet}:
        # Recurse on the matching subset with a fresh copy of the labels.
        myTree[bestFeatLable][value] = creatTree(
            splitDataSet(dataSet, bestFeat, value), labels[:])
    return myTree
mytree=creatTree(dataSet,labels)
mytree
{'outlook': {'overcast': 'yes',
'rain': {'wind': {'strong': 'no', 'weak': 'yes'}},
'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}}}}
labels = ['outlook','temperature','humidity','wind']
featlabels=labels[:]
testFeatValue=['rain','hot','high','weak']
def classify(tree, featlabels, testFeatValue):
    """Classify one sample by walking the decision tree.

    tree: nested dict {feature_name: {feature_value: subtree_or_label}}.
    featlabels: feature names in the same order as testFeatValue.
    testFeatValue: the sample's feature values.
    Returns the predicted class label, or None when the sample's value for
    the split feature has no branch in the tree.

    Bug fix: the original never initialized classLabel, so an unseen feature
    value raised UnboundLocalError instead of returning a result.
    """
    firstStr = list(tree.keys())[0]            # feature tested at this node
    secondDict = tree[firstStr]                # branches keyed by feature value
    featIndex = featlabels.index(firstStr)     # column of that feature in the sample
    classLabel = None                          # default when no branch matches
    for firstStr_value in secondDict.keys():
        if testFeatValue[featIndex] == firstStr_value:
            if isinstance(secondDict[firstStr_value], dict):
                # Internal node: keep walking down the matching subtree.
                classLabel = classify(secondDict[firstStr_value], featlabels, testFeatValue)
            else:
                # Leaf node: the branch directly stores the class label.
                classLabel = secondDict[firstStr_value]
    return classLabel
# Bug fix: the original wrote `classify = classify(...)`, rebinding the name
# and clobbering the classify() function; store the result separately.
prediction = classify(mytree, featlabels, testFeatValue)
print(prediction)
yes
tinydict = {'Name': 'Zara', 'Age': 7}
tinydict.keys()
type(tinydict).__name__
'dict'
2.C4.5算法
C4.5是ID3算法的一个改进,它基于增益率来选择最优特征,而计算增益率需要在ID3的基础上再求解特征熵,所以C4.5仅需在ID3的求信息熵函数和选择最优特征的函数上做些修改即可。
import operator
from math import log
# The same 14-sample "play tennis" data set, reused for the C4.5 variant.
# Columns: outlook, temperature, humidity, wind, class label (play? yes/no).
dataSet = [['sunny','hot','high','weak','no'],
['sunny','hot','high','strong','no'],
['overcast','hot','high','weak','yes'],
['rain','mild','high','weak','yes'],
['rain','cool','normal','weak','yes'],
['rain','cool','normal','strong','no'],
['overcast','cool','normal','strong','yes'],
['sunny','mild','high','weak','no'],
['sunny','cool','normal','weak','yes'],
['rain','mild','normal','weak','yes'],
['sunny','mild','normal','strong','yes'],
['overcast','mild','high','strong','yes'],
['overcast','hot','normal','weak','yes'],
['rain','mild','high','strong','no']]
# Feature names, aligned with the first four columns of every row.
labels = ['outlook','temperature','humidity','wind']
# The class-label column (last element of every row).
classList=[example[-1] for example in dataSet]
# Demo: a row holds 4 feature values plus the class label (length 5).
print(dataSet[0])
len(dataSet[0])
['sunny', 'hot', 'high', 'weak', 'no']
5
# Generalized entropy helper: works for any column, not just the label column.
def clacShanonEntFeature(dataSet, i):
    """Shannon entropy of column `i` of `dataSet` (i=-1 gives the class entropy)."""
    n = len(dataSet)
    # Frequency of each distinct value in column i.
    freq = {}
    for row in dataSet:
        key = row[i]
        freq[key] = freq.get(key, 0) + 1
    # Ent = -sum(p * log2 p) over the column's value distribution.
    result = 0.0
    for count in freq.values():
        p = count / float(n)
        result -= p * log(p, 2)
    return result
clacShanonEntFeature(dataSet,0)
1.5774062828523452
def splitDataSet(dataSet, axis, value):
    """Rows of `dataSet` matching `value` at column `axis`, with that column dropped."""
    matched = []
    for row in dataSet:
        if row[axis] != value:
            continue
        # Keep everything except the split column.
        matched.append(row[:axis] + row[axis + 1:])
    return matched
splitDataSet(dataSet[:],0,'rain')
#分割不含最优特征的子数据集
[['mild', 'high', 'weak', 'yes'],
['cool', 'normal', 'weak', 'yes'],
['cool', 'normal', 'strong', 'no'],
['mild', 'normal', 'weak', 'yes'],
['mild', 'high', 'strong', 'no']]
def chooseBestFeature(Dataset):
    """Return the index of the feature with the highest gain ratio (C4.5).

    Dataset: rows whose last column is the class label.
    Returns -1 when no feature yields a positive gain ratio.

    Bug fix: the original read the module-level `dataSet` instead of the
    `Dataset` argument (in the base entropy, the feature lists, the subset
    split and the split info), so every recursive call from creatTree chose
    its split on the FULL data set rather than on the subset it was given —
    which produced the nonsensical tree in the original output.
    """
    baseEntropy = clacShanonEntFeature(Dataset, -1)   # entropy of the class column
    bestFeature = -1
    bestGrainrate = 0
    numbersFeature = len(Dataset[0]) - 1              # the label column is not a feature
    for i in range(numbersFeature):
        featurelist = [example[i] for example in Dataset]
        uniquefeature = set(featurelist)
        # Conditional entropy of the class given feature i.
        newEntropy = 0
        for uniquevalue in uniquefeature:
            subDataSet = splitDataSet(Dataset, i, uniquevalue)
            p = float(len(subDataSet)) / len(Dataset)
            newEntropy += p * clacShanonEntFeature(subDataSet, -1)
        infroGain = baseEntropy - newEntropy          # information gain
        splitInfo = clacShanonEntFeature(Dataset, i)  # intrinsic value of feature i
        if splitInfo == 0:
            # A single-valued feature cannot split the data; skip it.
            continue
        Grainrate = infroGain / splitInfo             # gain ratio
        if Grainrate > bestGrainrate:
            bestGrainrate = Grainrate
            bestFeature = i
    return bestFeature
#dataSet
chooseBestFeature(dataSet)
0
def majorityCnt(classList):
    """Return the most frequent class label in `classList`.

    Bug fix: the original only incremented inside an `else` branch, so each
    label's first occurrence was recorded as 0 and every count was one too
    low (the ID3 version of this helper counts correctly; this makes the
    two consistent).
    """
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1    # count every occurrence, including the first
    # Rank by count, highest first; the winner leads.
    sortedClasscount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClasscount[0][0]
def creatTree(dataSet, labels):
    """Recursively build a C4.5 decision tree as nested dicts.

    dataSet: rows whose last column is the class label.
    labels: feature names aligned with the feature columns; NOTE: the chosen
            feature's name is deleted from this list (mutates the caller's list).
    Returns a class label (leaf) or {feature_name: {value: subtree}}.
    """
    outcomes = [example[-1] for example in dataSet]
    # Pure node: every sample carries the same class -> leaf.
    if outcomes.count(outcomes[0]) == len(outcomes):
        return outcomes[0]
    # Features exhausted (only the label column remains) -> majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(outcomes)
    bestFeature = chooseBestFeature(dataSet)   # split on the best gain-ratio feature
    bestLabel = labels[bestFeature]
    mytree = {bestLabel: {}}
    del labels[bestFeature]                    # this feature's label is now consumed
    # One branch per distinct value of the chosen feature.
    for branchValue in set(example[bestFeature] for example in dataSet):
        # Recurse on the matching subset with a fresh copy of the labels.
        mytree[bestLabel][branchValue] = creatTree(
            splitDataSet(dataSet, bestFeature, branchValue), labels[:])
    return mytree
creatTree(dataSet,labels)
{'outlook': {'rain': {'wind': {'no': 'no', 'yes': 'yes'}},
'sunny': {'wind': {'no': 'no', 'yes': 'yes'}},
'overcast': 'yes'}}