Machine Learning: The ID3 Decision Tree

A simple ID3 decision tree; the only difference in C4.5 is that it uses the information gain ratio to choose the split attribute. The formulas are easy to find online, so this is just a quick verification.
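
For reference, these are the standard definitions the code relies on (the original post only points to them): with class proportions $p_k$ in a data set $D$, and an attribute $A$ that splits $D$ into subsets $D_v$,

$H(D) = -\sum_k p_k \log_2 p_k$

$\mathrm{Gain}(D, A) = H(D) - \sum_v \frac{|D_v|}{|D|} H(D_v)$

$\mathrm{SplitInfo}(D, A) = -\sum_v \frac{|D_v|}{|D|} \log_2 \frac{|D_v|}{|D|}$

$\mathrm{GainRatio}(D, A) = \mathrm{Gain}(D, A) / \mathrm{SplitInfo}(D, A)$

ID3 chooses the attribute with the largest gain; C4.5 chooses the largest gain ratio, which is what the code below actually computes.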

#coding:utf-8

import math

# Compute the Shannon entropy of the class labels in a data set
def calcShannonEnt(dataset):
    numEntries = len(dataset)
    labelCounts = {}  # count of samples per class label
    for featVec in dataset:
        currentLabel = featVec[-1]  # the last column is the class label
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * math.log(prob, 2)
    return shannonEnt


def CreateDataSet():
    dataset = [['sunny', 'hot', 'high', 'weak', 'no'],
               ['sunny', 'hot', 'high', 'strong', 'no'],
               ['overcast', 'hot', 'high', 'weak', 'yes'],
               ['rain', 'mild', 'high', 'weak', 'yes'],
               ['rain', 'cool', 'normal', 'weak', 'yes'],
               ['rain', 'cool', 'normal', 'strong', 'no'],
               ['overcast', 'cool', 'normal', 'strong', 'yes'],
               ['sunny', 'mild', 'high', 'weak', 'no'],
               ['sunny', 'cool', 'normal', 'weak', 'yes'],
               ['rain', 'mild', 'normal', 'weak', 'yes'],
               ['sunny', 'mild', 'normal', 'strong', 'yes'],
               ['overcast', 'mild', 'high', 'strong', 'yes'],
               ['overcast', 'hot', 'normal', 'weak', 'yes'],
               ['rain', 'mild', 'high', 'strong', 'no'],
               ]
    labels = ['outlook', 'temperature', 'humidity', 'wind']
    return dataset, labels
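
# Quick sanity check (illustrative, not from the original post): the data set
# above contains 9 'yes' and 5 'no' samples, so its entropy should be
#   H = -(9/14)*log2(9/14) - (5/14)*log2(5/14) ≈ 0.940
# >>> myDat, labels = CreateDataSet()
# >>> round(calcShannonEnt(myDat), 3)
# 0.94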


# Return the subset of samples whose attribute at index `axis` equals `value`,
# with that attribute column removed
def splitDataSet(dataSet, axis, value):
    retDataSet = []
    # build the new sample list without the split attribute
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)

    return retDataSet
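
# For example (illustrative, assuming the data set from CreateDataSet above):
# splitting on attribute 0 ('outlook') with the value 'overcast' keeps the four
# overcast samples, drops the outlook column, and all of them are labelled 'yes'.
# >>> myDat, labels = CreateDataSet()
# >>> splitDataSet(myDat, 0, 'overcast')
# [['hot', 'high', 'weak', 'yes'], ['cool', 'normal', 'strong', 'yes'],
#  ['mild', 'high', 'strong', 'yes'], ['hot', 'normal', 'weak', 'yes']]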


# Choose the attribute with the largest information gain (here: gain ratio) as the split node
def chooseBestFeatureToSplit(dataSet):
    numberFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1

    for i in range(numberFeatures):
        featList = [example[i] for example in dataSet]
        # print(featList)
        uniqueVals = set(featList)
        newEntropy = 0.0
        split_infor = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
            split_infor -= prob * math.log(prob, 2)  # split information (intrinsic value) of the attribute
        # infoGain = baseEntropy - newEntropy  # ID3: information gain
        if split_infor == 0.0:
            continue  # attribute has only one value here; the gain ratio is undefined, skip it
        infoGain = (baseEntropy - newEntropy) / split_infor  # C4.5: information gain ratio
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
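
# With the data set above, both the plain information gain and the gain ratio
# favour 'outlook', so this call is expected to return index 0
# (illustrative check, not from the original post):
# >>> myDat, labels = CreateDataSet()
# >>> chooseBestFeatureToSplit(myDat)
# 0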


# When the attributes are used up but the samples are still not pure,
# fall back to a majority vote over the class labels
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    return max(classCount, key=classCount.get)  # label with the highest count, not the "largest" key
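
# Illustrative check: majorityCnt(['no', 'yes', 'no']) returns 'no', the majority
# label, even though 'yes' would sort after 'no' alphabetically.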


def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]  # class labels, used for the stopping conditions
    # stop splitting when all samples share the same class
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # all attributes used up: fall back to majority vote
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # index of the attribute with the largest gain ratio
    bestFeatLabel = labels[bestFeat]  # its attribute name
    myTree = {bestFeatLabel: {}}  # the tree is stored as nested dicts
    del labels[bestFeat]  # remove the attribute that has just been used
    featValues = [example[bestFeat] for example in dataSet]  # values of the chosen attribute, one branch per value
    uniqueVals = set(featValues)
    # recursively build a subtree for each value of the chosen attribute
    for value in uniqueVals:
        subLabels = labels[:]
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree


myDat, labels = CreateDataSet()
tree = createTree(myDat, labels)
print(tree)
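
The script above only builds the tree; it never uses it for prediction. Below is a minimal classification sketch (not part of the original post): `classify` and `sample` are hypothetical names, and the attribute list is fetched again because createTree deletes entries from `labels` while building the tree.

def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]      # attribute tested at this node
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)    # position of that attribute in the sample
    branch = secondDict[testVec[featIndex]]   # follow the branch matching the sample's value
    if isinstance(branch, dict):              # internal node: keep walking down
        return classify(branch, featLabels, testVec)
    return branch                             # leaf node: predicted class label


_, featLabels = CreateDataSet()               # fresh attribute list; createTree consumed `labels`
sample = ['sunny', 'cool', 'high', 'strong']  # a hypothetical unseen day
print(classify(tree, featLabels, sample))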

