Data Mining: Building a Decision Tree with the ID3 Algorithm

The script below builds a decision tree with the ID3 algorithm: at every node it splits on the attribute with the largest information gain, and it recurses until a subset is pure or no attributes are left.

from math import log
import operator

def calShang(dataSet):    # compute the Shannon entropy of the data set
    numEntries=len(dataSet)    # number of samples in the data set
    labelCounts={}    # dict counting the occurrences of each class label
    for featVec in dataSet:    # tally the class label of every sample
        currentLabel=featVec[-1]    # the class label is the last column
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel]=0
        labelCounts[currentLabel]+=1
    shannonEnt=0.0   # entropy accumulator
    for key in labelCounts:    # sum each class's contribution to the entropy
        prob=float(labelCounts[key])/numEntries   # proportion of this class in the data set
        shannonEnt-=prob*log(prob,2)
    return shannonEnt    # entropy of the data set
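
# calShang implements the Shannon entropy H(D) = -sum_k p_k * log2(p_k), where
# p_k is the proportion of class k in the data set D. As a sanity check, the
# 14-row weather data set below has 9 "yes" and 5 "no" rows, so its entropy is
# -(9/14)*log2(9/14) - (5/14)*log2(5/14) ≈ 0.940 bits.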

def splitDataSet(dataSet,axis,value):    # keep the rows whose attribute at position axis equals value, dropping that column
    retDataSet=[]
    for featVec in dataSet:
        if featVec[axis]==value:
            reducedFeatVec=featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet
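
# For example, splitting [[1,"a","yes"],[1,"b","no"],[0,"a","no"]] with axis=0
# and value=1 returns [["a","yes"],["b","no"]]: only the matching rows, with
# the tested column removed so it is not considered again further down the tree.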

def chooseBestFeatureTosplit(dataSet):    # choose the best attribute to split the data set on
    numFeatures=len(dataSet[0])-1    # the last column is the class label, not an attribute
    baseEntropy=calShang(dataSet)    # entropy before splitting
    maxInfoGain=0.0    # largest information gain found so far
    bestFeature=0    # index of the attribute with the largest information gain
    for i in range(numFeatures):
        # collect the distinct values this attribute takes
        featList=[j[i] for j in dataSet]
        uniqueValues=set(featList)
        # compute the weighted entropy after splitting on this attribute
        newEntropy=0.0
        for value in uniqueValues:
            subDataSet=splitDataSet(dataSet,i,value)
            prob=len(subDataSet)/float(len(dataSet))    # fraction of samples in this branch
            newEntropy+=prob*calShang(subDataSet)
        infoGain=baseEntropy-newEntropy
        # remember the attribute with the largest information gain
        if infoGain>maxInfoGain:
            maxInfoGain=infoGain
            bestFeature=i
    return bestFeature
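
# ID3 scores each attribute A by its information gain
#     Gain(D, A) = H(D) - sum_v (|D_v| / |D|) * H(D_v),
# where D_v is the subset of D in which A takes the value v; newEntropy above
# is the weighted sum on the right. On the weather data set below, "outlook"
# wins the first split with a gain of about 0.247 bits.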

def majorityCnt(classList):    # return the most frequent class label in the list
    classCount={}
    for i in classList:    # count the occurrences of each label
        if i not in classCount.keys():
            classCount[i]=0
        classCount[i]+=1
    sortedClassCount=sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)    # sort by count, descending
    return sortedClassCount[0][0]    # the label with the highest count
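
# For example, majorityCnt(["yes","no","yes"]) returns "yes". createTree falls
# back on this vote when the attributes are exhausted but a leaf is still mixed.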

def createTree(dataSet,labels):
    classList=[j[-1] for j in dataSet]    # class labels of all samples
    if classList.count(classList[0])==len(classList):    # all samples share one class:
        return classList[0]                              # stop splitting and return that label
    if len(dataSet[0])==1:    # no attributes left, only the class column:
        return majorityCnt(classList)    # return the most common class label
    bestFeat=chooseBestFeatureTosplit(dataSet)    # index of the best attribute to split on
    bestFeatLabel=labels[bestFeat]    # its name, used as this node's key
    mytree={bestFeatLabel:{}}
    del(labels[bestFeat])    # remove the attribute chosen for this node
    featValues=[j[bestFeat] for j in dataSet]    # values this attribute takes in the data set
    uniqueValues=set(featValues)
    for value in uniqueValues:
        subLabels=labels[:]    # copy so the recursive calls do not interfere with each other
        mytree[bestFeatLabel][value]=createTree(splitDataSet(dataSet,bestFeat,value),subLabels)
    return mytree
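
# createTree returns the tree as nested dicts: every internal node has the form
# {attributeName: {value1: subtree-or-label, ...}} and every leaf is a plain
# class label. Below is a minimal, hypothetical sketch (not part of the code
# above) of how such a tree could classify a new sample; it assumes every
# attribute value in testVec also occurred in the training data.
def classify(tree,featLabels,testVec):    # hypothetical helper, for illustration only
    firstFeat=list(tree.keys())[0]    # attribute tested at this node
    featIndex=featLabels.index(firstFeat)    # its column in the sample vector
    subTree=tree[firstFeat][testVec[featIndex]]    # follow the branch for the sample's value
    if isinstance(subTree,dict):    # internal node: keep descending
        return classify(subTree,featLabels,testVec)
    return subTree    # leaf: the predicted class label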



def createDataSet():    # the classic 14-row "play tennis" weather data set
    dataSet=[["sunny","hot","high","false","no"],
             ["sunny","hot","high","true","no"],
             ["overcast","hot","high","false","yes"],
             ["rainy","mild","high","false","yes"],
             ["rainy","cool","normal","false","yes"],
             ["rainy","cool","normal","true","no"],
             ["overcast","cool","normal","true","yes"],
             ["sunny","mild","high","false","no"],
             ["sunny","cool","normal","false","yes"],
             ["rainy","mild","normal","false","yes"],
             ["sunny","mild","normal","true","yes"],
             ["overcast","mild","high","true","yes"],
             ["overcast","hot","normal","false","yes"],
             ["rainy","mild","high","true","no"]]
    labels=['outlook','temperature','humidity','windy','play']    # attribute names; the last entry names the class column
    return dataSet,labels

myData,labels=createDataSet()
myTree=createTree(myData,labels[:])    # pass a copy: createTree deletes the labels it uses
print(myTree)
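
# Apart from key order, the printed tree should be equivalent to:
# {'outlook': {'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}},
#              'overcast': 'yes',
#              'rainy': {'windy': {'false': 'yes', 'true': 'no'}}}}
# Using the hypothetical classify() sketch above with the preserved labels:
print(classify(myTree,labels,["sunny","cool","high","true"]))    # expected: 'no'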