决策树搬运

note:

python3 中 dict.keys() 返回的不是 list 类型(而是视图对象),需要按下标取值时要先用 list() 转换

可以用 type(x).__name__ == 'dict' 来判断类型;更推荐直接使用 isinstance(x, dict)

a = [example[0] for example in list] 可以把双重列表里的某位值取出来获得新的列表

决策树需要读取数据,需要计算香农熵模块,需要划分数据计算香农熵,最后建树

from math import log
import operator

def get_data():
    """Return the built-in toy training set.

    Returns:
        A ``(feature_names, rows)`` tuple. ``feature_names`` holds one name
        per feature column; each row is a list of feature values followed by
        its class label (``'yes'`` / ``'no'``) in the last position. New list
        objects are built on every call.
    """
    feature_names = ['sb', 'handsonboy']
    rows = [
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    return feature_names, rows

def caclulate_shannone(data):
    """Compute the Shannon entropy (base 2) of the class labels in ``data``.

    Args:
        data: Sequence of rows; the last element of each row is taken as its
            class label.

    Returns:
        The entropy as a float. An empty ``data`` yields ``0.0``.
    """
    # Tally how many rows carry each class label.
    tally = {}
    for row in data:
        cls = row[-1]
        tally[cls] = tally.get(cls, 0) + 1

    total = len(data)
    entropy = 0.0
    for count in tally.values():
        p = float(count) / total
        entropy -= p * log(p, 2)
    return entropy


def split_data(data, axis, value):
    """Select the rows whose feature ``axis`` equals ``value``, dropping that column.

    Args:
        data: Sequence of list rows.
        axis: Index of the feature column to test and remove.
        value: Feature value a row must have at ``axis`` to be kept.

    Returns:
        A new list of new rows; each kept row has the ``axis`` element removed.
        The input rows are not modified.
    """
    return [
        row[:axis] + row[axis + 1:]
        for row in data
        if row[axis] == value
    ]

def choosefeatrue(label, data, shannonent):
    """Pick the feature column whose split gives the largest information gain.

    Args:
        label: Feature names. Accepted to keep the signature; not read here.
        data: Non-empty sequence of rows, class label in the last position.
        shannonent: Baseline entropy that each split's weighted entropy is
            subtracted from to obtain its gain.

    Returns:
        Index of the winning feature column, or ``-1`` when no column has a
        gain strictly greater than zero.
    """
    feature_count = len(data[0]) - 1
    row_total = float(len(data))
    top_gain = 0.0
    winner = -1
    for col in range(feature_count):
        # Distinct values observed in this column (a set, not a dict).
        distinct = set(row[col] for row in data)
        # Weighted sum of the entropies of the partitions this column induces.
        split_entropy = 0.0
        for val in distinct:
            part = split_data(data, col, val)
            weight = len(part) / row_total
            split_entropy += weight * caclulate_shannone(part)
        gain = shannonent - split_entropy
        if gain > top_gain:
            top_gain, winner = gain, col
    return winner

def get_mytree(data, label, shannoneent):
    """Recursively build a decision tree as nested dicts.

    Args:
        data: Non-empty sequence of rows; the last element of each row is the
            class label, the preceding elements are feature values.
        label: Feature names, one per feature column of ``data``. The list is
            not modified.
        shannoneent: Shannon entropy of ``data`` (the baseline for choosing
            the split at this node).

    Returns:
        A class label when the node is a leaf, otherwise a dict of the form
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.
    """
    classlist = [example[-1] for example in data]
    # Every row has the same class: this node is a pure leaf.
    if classlist.count(classlist[0]) == len(classlist):
        return classlist[0]
    # Only the class column is left: fall back to a majority vote.
    if len(data[0]) == 1:
        return major(classlist)

    bestfeat = choosefeatrue(label, data, shannoneent)
    if bestfeat < 0:
        # No feature yields a positive gain. Using -1 as an index would pick
        # the last label and split on the class column itself, so stop here.
        return major(classlist)

    bestfeatlabel = label[bestfeat]
    # Build a new list instead of deleting in place so the caller's label
    # list survives the call unchanged.
    sublabels = label[:bestfeat] + label[bestfeat + 1:]
    mytree = {bestfeatlabel: {}}
    # One branch per distinct value of the chosen feature.
    for value in set(example[bestfeat] for example in data):
        subset = split_data(data, bestfeat, value)
        # Each child is scored against the entropy of its own subset, not the
        # entropy that was computed for an ancestor.
        mytree[bestfeatlabel][value] = get_mytree(
            subset, sublabels, caclulate_shannone(subset))
    return mytree

def major(classlist):
    """Return the most frequent class label in ``classlist``.

    Ties are resolved in favour of the label that appears first in
    ``classlist``.

    Args:
        classlist: Non-empty sequence of hashable class labels.

    Returns:
        The label with the highest count.

    Raises:
        ValueError: If ``classlist`` is empty.
    """
    classcount = {}
    for vote in classlist:
        classcount[vote] = classcount.get(vote, 0) + 1
    # dict.items() replaces the Python-2-only iteritems(); max() returns the
    # first maximal entry, matching a stable descending sort.
    return max(classcount.items(), key=operator.itemgetter(1))[0]

def classify(inputtree, featlabels, testvec):
    """Classify ``testvec`` by walking a nested-dict decision tree.

    Args:
        inputtree: Tree of the form
            ``{feature_name: {feature_value: subtree_or_label, ...}}``.
        featlabels: Feature names; the position of a name gives the index of
            that feature inside ``testvec``.
        testvec: Feature values of the sample to classify.

    Returns:
        The class label stored at the leaf that ``testvec`` reaches.

    Raises:
        ValueError: If the tree has no branch for the value ``testvec`` holds
            at the feature being tested (or that feature name is missing from
            ``featlabels``).
    """
    # In Python 3 keys() is a view, so take the single root key via iteration.
    root_label = next(iter(inputtree))
    branches = inputtree[root_label]
    observed = testvec[featlabels.index(root_label)]
    if observed not in branches:
        # Previously this case fell through the loop and failed with an
        # UnboundLocalError on the return statement.
        raise ValueError(
            "no branch for value {!r} of feature {!r}".format(observed, root_label))
    subtree = branches[observed]
    if isinstance(subtree, dict):
        return classify(subtree, featlabels, testvec)
    return subtree

if __name__ == "__main__":
    # Demo: train on the toy data set, then classify the sample [1, 1].
    labels, rows = get_data()
    root_entropy = caclulate_shannone(rows)
    mytree = get_mytree(rows, labels, root_entropy)
    # Fetch a fresh label list for classification in case the one handed to
    # get_mytree was altered while the tree was built.
    labels, _ = get_data()
    print(classify(mytree, labels, [1, 1]))

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值