A Complete Implementation of the Decision Tree Algorithm

from math import log
import operator
# Compute the empirical entropy H(D) of the dataset
def calcShannonEnt(dataset):
    numEntries = len(dataset)
    labelCounts = {}
    
    for featVec in dataset:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts:
            # Initialize the count for a label seen for the first time
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        # H(D) = -sum over classes of p * log2(p)
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
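
As a quick sanity check of the formula H(D) = -Σ p·log2(p) (this snippet is an added illustration, not part of the original post): a dataset whose labels are split 50/50 between two classes has exactly 1 bit of entropy.

# Added check: a 50/50 class split yields entropy 1.0
print(calcShannonEnt([['yes'], ['no']]))   # 1.0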

def createDataset():
    dataset = [[0,0,0,0,'no'],
               [0,0,0,1,'no'],
               [0,1,0,1,'yes'],
               [0,1,1,0,'yes'],
               [0,0,0,0,'no'],
               
               [1,0,0,0,'no'],
               [1,0,0,1,'no'],
               [1,1,1,1,'yes'],
               [1,0,1,2,'yes'],
               [1,0,1,2,'yes'],
               
               [2,0,1,2,'yes'],
               [2,0,1,1,'yes'],
               [2,1,0,1,'yes'],
               [2,1,0,2,'yes'],
               [2,0,0,0,'no']
             ]
               
    labels = ['age', 'work', 'house', 'credit']
    return dataset,labels
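
This looks like the classic loan-applicant example from Li Hang's Statistical Learning Methods: each row is (age, work, house, credit, class), with feature values encoded as small integers. With 9 'yes' and 6 'no' samples, H(D) = -(9/15)·log2(9/15) - (6/15)·log2(6/15) ≈ 0.971, which the entropy function above reproduces (added example):

# Entropy of the full dataset: 9 'yes' vs 6 'no'
dataset, labels = createDataset()
print(calcShannonEnt(dataset))   # ≈ 0.9710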

def splitDataset(dataSet, axis, value):
    retDataset = []
    
    for featVec in dataSet:
        if featVec[axis] == value:
            # Remove the column at axis, keeping the remaining features
            reduceFeatVec = featVec[0:axis]
            reduceFeatVec.extend(featVec[axis+1:])
            retDataset.append(reduceFeatVec)
    return retDataset
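
For instance (added example, using the dataset from createDataset above), splitting on axis 0 (age) with value 0 keeps the five rows whose age code is 0 and drops that column:

# Rows with age == 0, age column removed
print(splitDataset(dataset, 0, 0))
# [[0, 0, 0, 'no'], [0, 0, 1, 'no'], [1, 0, 1, 'yes'], [1, 1, 0, 'yes'], [0, 0, 0, 'no']]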

def chooseBestFeatureToSplit(dataset):
    # The last column is the class label, not a feature
    numFeatures = len(dataset[0]) - 1
    baseEntropy = calcShannonEnt(dataset)
    
    bestInfoGain = 0.0
    bestFeature = -1
    
    for i in range(numFeatures):

        featList = [example[i] for example in dataset]
        uniqueVals = set(featList)
        newEntropy = 0.0
        
        for value in uniqueVals:
            subDataset = splitDataset(dataset, i, value)
            # Fraction of samples taking this value of feature i
            prob = len(subDataset) / float(len(dataset))
            # Accumulate the empirical conditional entropy H(D|A)
            newEntropy += prob * calcShannonEnt(subDataset)
        # Information gain: g(D, A) = H(D) - H(D|A)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
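
On this dataset the function returns 2, the 'house' column: its information gain, 0.971 - 0.551 ≈ 0.420, is the largest among the four features (the gain value is worked out by hand for this added check):

print(chooseBestFeatureToSplit(dataset))   # 2, i.e. 'house'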

# Majority vote: return the most frequent class label
def majority(classlist):
    classcount = {}
    for vote in classlist:
        if vote not in classcount:
            classcount[vote] = 0
        classcount[vote] += 1
    sortedclasscount = sorted(classcount.items(), key=operator.itemgetter(1), reverse=True)

    return sortedclasscount[0][0]
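
A minimal usage sketch (added): given the class labels of a subset, majority returns the most frequent one.

print(majority(['yes', 'no', 'yes']))   # 'yes'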

def createtree(dataset, labels):
    classlist = [example[-1] for example in dataset]
    # Stop if every sample in this subset has the same class
    if classlist.count(classlist[0]) == len(classlist):
        return classlist[0]
    # Only the label column remains: all features have been used,
    # so fall back to a majority vote over the class labels
    if len(dataset[0]) == 1:
        return majority(classlist)
    bestfeature = chooseBestFeatureToSplit(dataset)
    # chooseBestFeatureToSplit returns a column index; map it to its feature name
    bestfeaturelabel = labels[bestfeature]
    # The tree is a nested dict keyed by feature name, then by feature value
    mytree = {bestfeaturelabel: {}}
    # Remove the chosen feature so it is not split on again
    del(labels[bestfeature])
    # Collect the distinct values taken by the best feature
    featurevalues = [example[bestfeature] for example in dataset]
    uniquevals = set(featurevalues)
    
    for value in uniquevals:
        # Copy labels so each recursive branch works on its own list
        sublabels = labels[:]
        # Recurse on the subset where the best feature equals value;
        # the recursion bottoms out at a class label ('yes' or 'no')
        mytree[bestfeaturelabel][value] = createtree(splitDataset(dataset, bestfeature, value), sublabels)
    return mytree

dataset, labels = createDataset()
mytree = createtree(dataset, labels)
print(mytree)
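
For this dataset the printed tree is {'house': {0: {'work': {0: 'no', 1: 'yes'}}, 1: 'yes'}} (key order may vary): every applicant with a house is approved, otherwise the decision falls to the work feature. The original post stops at printing the tree; a classify helper that walks the nested dict is a natural companion. The sketch below is an addition, not from the original, and assumes a labels list in training-column order (createtree mutates labels, so fetch a fresh copy):

# Added sketch: walk the nested-dict tree to classify one sample.
# featLabels must list the feature names in training-column order.
def classify(tree, featLabels, testVec):
    feature = next(iter(tree))            # feature name stored at this node
    featIndex = featLabels.index(feature)
    subtree = tree[feature][testVec[featIndex]]
    if isinstance(subtree, dict):         # internal node: keep descending
        return classify(subtree, featLabels, testVec)
    return subtree                        # leaf node: a class label

_, featLabels = createDataset()
print(classify(mytree, featLabels, [1, 0, 1, 2]))   # 'yes' (has a house)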
