# 决策树 (decision tree)

from math import log
import operator
def calcShannonEnt(dataSet):
    """Return the base-2 Shannon entropy of the class labels in *dataSet*.

    The class label of a row is its last element. An empty data set
    yields ``0.0``.
    """
    total = len(dataSet)

    # Tally how many rows carry each class label.
    tally = {}
    for row in dataSet:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1

    # Accumulate -p * log2(p) over the labels, in first-seen order.
    entropy = 0.0
    for count in tally.values():
        fraction = float(count) / total
        entropy -= fraction * log(fraction, 2)
    return entropy

def CreateDataSet():
    """Build the demo training set and return it with its feature names.

    Each row holds two feature values followed by a class label. Both the
    rows and the feature names are printed before being returned as a
    ``(rows, feature_names)`` tuple. Fresh lists are created on every call.
    """
    feature_names = ['no surfacing', 'flippers']
    samples = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [1, 1, 'no'],
        [1, 0, 'no'],
    ]
    print(samples, feature_names)
    return samples, feature_names

# Example: for data = [[2, 4, 3], [1, 2, 2], [4, 5, 3]],
# splitDataSet(data, 2, 3) returns [[2, 4], [4, 5]].
def splitDataSet(dataSet, axis, value):
    """Select the rows whose column *axis* equals *value*, minus that column.

    Args:
        dataSet: iterable of rows; each row is a sliceable sequence
            (list or tuple).
        axis: index of the column to test and then drop.
        value: value the tested column must equal for a row to be kept.

    Returns:
        A new list of new lists. Neither ``dataSet`` nor its rows are
        modified.
    """
    # Build each reduced row from slices converted to lists so that tuple
    # rows work too (a tuple slice has no ``extend``).
    return [
        list(row[:axis]) + list(row[axis + 1:])
        for row in dataSet
        if row[axis] == value
    ]


def chaooseBestFeatureTosplit(dataSet):
    """Return the index of the feature column with the largest information gain.

    Every column except the last (the class label) is a candidate. For each
    candidate the data set is partitioned by that column's distinct values
    and the size-weighted entropy of the partitions is subtracted from the
    entropy of the whole set.

    Returns:
        The index of the best column, or ``-1`` when no column yields a
        strictly positive gain (including when there are no feature
        columns at all).
    """
    total_rows = len(dataSet)
    feature_count = len(dataSet[0]) - 1
    entropy_before = calcShannonEnt(dataSet)

    top_gain, top_index = 0.0, -1
    for index in range(feature_count):
        distinct_values = {row[index] for row in dataSet}
        entropy_after = 0.0
        for candidate in distinct_values:
            partition = splitDataSet(dataSet, index, candidate)
            entropy_after += len(partition) / total_rows * calcShannonEnt(partition)
        gain = entropy_before - entropy_after
        # Strict comparison: on equal gain the earlier column is kept.
        if gain > top_gain:
            top_index, top_gain = index, gain
    return top_index

def majoritycnt(classList):
    """Return the most frequent class label in *classList*.

    Ties are resolved in favour of the label that appears first. An empty
    list yields ``None``.
    """
    if not classList:
        return None
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # max() keeps the first maximal item, and the dict preserves first-seen
    # order, so the earliest label wins a tie.
    return max(classCount.items(), key=operator.itemgetter(1))[0]


def CreateTree(dataSet,labes):
    """Recursively build a decision tree from *dataSet*.

    Args:
        dataSet: non-empty list of rows; the last element of each row is
            its class label, the preceding elements are feature values.
        labes: feature names, one per feature column. The caller's list
            is not modified.

    Returns:
        A class label for a leaf, otherwise a nested dict of the form
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.
    """
    # Progress trace kept from the original implementation.
    print(dataSet,'label ',labes)
    classList = [row[-1] for row in dataSet]

    # Every row has the same class: this branch is a pure leaf.
    if classList.count(classList[0]) == len(classList):
        print("aaaaa")
        return classList[0]

    # Only the class column is left: fall back to a majority vote.
    if len(dataSet[0]) == 1:
        print("bbbbb")
        return majoritycnt(classList)

    bestFeature = chaooseBestFeatureTosplit(dataSet)
    if bestFeature < 0:
        # No remaining feature separates the classes. Indexing with -1 here
        # would split on the class-label column, so vote instead.
        return majoritycnt(classList)

    bestFeatureLabe = labes[bestFeature]
    # Build the reduced name list without mutating the caller's list.
    subLabels = labes[:bestFeature] + labes[bestFeature + 1:]

    myTree = {bestFeatureLabe: {}}
    for value in {row[bestFeature] for row in dataSet}:
        myTree[bestFeatureLabe][value] = CreateTree(
            splitDataSet(dataSet, bestFeature, value), subLabels
        )
    return myTree



# Demo: build the sample data set and print the tree learned from it.
dataSet, labels = CreateDataSet()
# print(chaooseBestFeatureTosplit(dataSet))
print(CreateTree(dataSet, labels))

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

Car12

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值