# Machine Learning in Action, Chapter 3: building a decision tree (ID3)

from math import log
import  operator
def calcShannnonEnt (dataset):
    """Return the Shannon entropy of the class labels in *dataset*.

    Each row is a feature vector whose LAST element is the class label.
    Entropy is computed in bits (log base 2); an empty or single-class
    dataset yields 0.0.
    """
    total = len(dataset)

    # Tally how many rows carry each class label.
    counts = {}
    for row in dataset:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1

    # H = -sum(p_i * log2(p_i)) over the observed label frequencies.
    entropy = 0.0
    for tally in counts.values():
        p = float(tally) / total
        entropy -= p * log(p, 2)
    return entropy

def createDataSet():
    """Return the toy fish-classification dataset from ML in Action ch. 3.

    Returns a pair ``(samples, featureNames)`` where each sample is
    ``[no_surfacing, flippers, label]`` with binary features and a
    'yes'/'no' class label.
    """
    samples = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    featureNames = ['no surfacing', 'flippers']
    return samples, featureNames

def splitDataSet(dataSet , axis ,value):
    """Return the rows of *dataSet* whose column *axis* equals *value*,
    with that column removed from each returned row.

    The input list is not modified; each kept row is a new list.
    """
    # Slice concatenation drops exactly the matched column.
    return [row[:axis] + row[axis + 1:]
            for row in dataSet
            if row[axis] == value]

def chooseBestFeatureToSplit(dataset):
    """Return the index of the feature whose split gives the largest
    information gain, or -1 if no split beats zero gain.

    The last column of every row is the class label and is never
    considered a candidate feature.
    """
    featureCount = len(dataset[0]) - 1
    baseEntropy = calcShannnonEnt(dataset)

    bestGain = 0.0
    bestIndex = -1

    for col in range(featureCount):
        # Distinct values this feature takes across the dataset.
        distinct = {row[col] for row in dataset}

        # Weighted average entropy of the partitions induced by this feature.
        splitEntropy = 0.0
        for v in distinct:
            subset = splitDataSet(dataset, col, v)
            weight = len(subset) / float(len(dataset))
            splitEntropy += weight * calcShannnonEnt(subset)

        gain = baseEntropy - splitEntropy
        if gain > bestGain:
            bestGain = gain
            bestIndex = col

    return bestIndex

def majorityCnt(classList):
    """Return the most frequent label in *classList* (majority vote).

    Used as the leaf decision when no features remain to split on.
    Ties are broken by first-seen order, since dicts preserve insertion
    order and ``sorted`` is stable.
    """
    classCount = {}
    for label in classList:
        classCount[label] = classCount.get(label, 0) + 1

    # BUGFIX: dict.iteritems() is Python 2 only and raises AttributeError
    # on Python 3; .items() is the portable equivalent.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]

def createTree(dataset , labels):
    """Recursively build an ID3 decision tree as nested dicts.

    dataset -- list of rows; each row's last element is the class label.
    labels  -- feature names aligned with the feature columns.
               NOTE: this list is mutated (the chosen feature's name is
               deleted at each level), matching the book's convention;
               pass a copy if the caller needs it intact.

    Returns either a class label (leaf) or a dict of the form
    ``{featureName: {featureValue: subtree_or_label, ...}}``.

    Fixes vs. the original: removed the dead no-op statement
    ``len(classList)`` and the ``calssList`` typo in the local name.
    """
    classList = [example[-1] for example in dataset]

    # Base case 1: every remaining sample has the same class -> leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Base case 2: only the label column remains -> majority vote.
    if len(dataset[0]) == 1:
        return majorityCnt(classList)

    bestFeat = chooseBestFeatureToSplit(dataset)
    bestFeatLabel = labels[bestFeat]
    del labels[bestFeat]  # caller-visible mutation (see docstring)

    tree = {bestFeatLabel: {}}

    # Recurse once per distinct value of the chosen feature.
    uniqueVals = set(example[bestFeat] for example in dataset)
    for val in uniqueVals:
        # Copy so sibling branches all see the same remaining labels.
        sublabels = labels[:]
        tree[bestFeatLabel][val] = createTree(
            splitDataSet(dataset, bestFeat, val), sublabels)
    return tree

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值