This section walks through the decision tree code itself; the comments alongside the code explain each step. You can copy it and run it directly. Tree visualization and a worked example will be posted separately.
from math import log
import operator

# Compute the Shannon entropy of a dataset
def calcshannonent(dataset):
    numentries = len(dataset)
    labelcounts = {}
    for featvec in dataset:
        currentlabel = featvec[-1]  # the class label is the last element of each row
        if currentlabel not in labelcounts:
            labelcounts[currentlabel] = 0
        labelcounts[currentlabel] += 1
    shannonent = 0.0
    for key in labelcounts:
        prob = float(labelcounts[key]) / numentries  # frequency of this class
        shannonent -= prob * log(prob, 2)
    return shannonent
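calcshannonent implements the usual formula H = -sum_k p_k * log2(p_k) over the class frequencies. As a sanity check, the sample dataset defined below has two 'yes' rows and three 'no' rows, so working the formula by hand gives the value the function should return:
# -(2/5)*log(2/5, 2) - (3/5)*log(3/5, 2)
# 0.9709505944546686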
# A simple test dataset
def createdataset():
    dataset = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flipper']
    return dataset, labels
# mydat, labels = createdataset()
# calcshannonent(mydat)
# 0.9709505944546686
# The higher the entropy, the more mixed the data. Adding a third class raises it:
# mydat[0][-1] = 'maybe'
# mydat
# [[1, 1, 'maybe'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
# calcshannonent(mydat)
# 1.3709505944546687
# Split the dataset on a given feature
def splitdataset(dataset, axis, value):
    # dataset: the dataset; axis: index of the feature to split on;
    # value: the feature value the returned rows must match
    retdataset = []
    for featvec in dataset:
        if featvec[axis] == value:
            # Copy the row with the splitting feature removed
            reducefeatvec = featvec[:axis]
            reducefeatvec.extend(featvec[axis+1:])
            retdataset.append(reducefeatvec)
    return retdataset
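For example, splitting the sample data on feature 0 keeps only the matching rows and removes that column:
# mydat, labels = createdataset()
# splitdataset(mydat, 0, 1)
# [[1, 'yes'], [1, 'yes'], [0, 'no']]
# splitdataset(mydat, 0, 0)
# [[1, 'no'], [1, 'no']]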
# Choose the best feature to split on (highest information gain)
def choosebestfeaturetosplit(dataset):
    numfeatures = len(dataset[0]) - 1  # the last column is the class label
    baseentropy = calcshannonent(dataset)
    bestinfogain = 0.0
    bestfeature = -1
    for i in range(numfeatures):
        featlist = [example[i] for example in dataset]  # all values of feature i
        uniquevals = set(featlist)  # the unique values of this feature
        newentropy = 0.0
        for value in uniquevals:
            subdataset = splitdataset(dataset, i, value)
            prob = len(subdataset) / float(len(dataset))
            newentropy += prob * calcshannonent(subdataset)  # weighted entropy after the split
        infogain = baseentropy - newentropy
        if infogain > bestinfogain:
            bestinfogain = infogain
            bestfeature = i
    return bestfeature
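On the sample data this chooses feature 0 ('no surfacing'): splitting on it leaves a weighted entropy of (3/5)*0.918 + (2/5)*0 ≈ 0.551, a gain of about 0.420, while splitting on 'flipper' leaves (4/5)*1.0 + (1/5)*0 = 0.8, a gain of only about 0.171:
# mydat, labels = createdataset()
# choosebestfeaturetosplit(mydat)
# 0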
# When all features are used up but the class labels are still mixed,
# decide the leaf's class by majority vote
def majoritycnt(classlist):
    classcount = {}
    for vote in classlist:  # count how often each class appears, much like classify0() in the kNN chapter
        if vote not in classcount:
            classcount[vote] = 0
        classcount[vote] += 1
    sortedclasscount = sorted(classcount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedclasscount[0][0]  # the most frequent class
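A quick illustration with a made-up label list:
# majoritycnt(['yes', 'no', 'no'])
# 'no'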
# Build the tree
def createtree(dataset, labels):
    classlist = [example[-1] for example in dataset]  # the class labels of every row
    if classlist.count(classlist[0]) == len(classlist):  # all labels identical: return that class
        return classlist[0]
    if len(dataset[0]) == 1:  # no features left but labels still mixed: majority vote
        return majoritycnt(classlist)
    bestfeat = choosebestfeaturetosplit(dataset)  # index of the best feature to split on
    bestfeatlabel = labels[bestfeat]  # its name
    mytree = {bestfeatlabel: {}}  # the tree is stored as nested dicts
    # Note on del: it acts on the variable (the name binding), not on the data
    # object itself
    del(labels[bestfeat])
    featvalues = [example[bestfeat] for example in dataset]
    uniquevals = set(featvalues)
    # The two lines above collect every value the best feature takes
    for value in uniquevals:
        # Lists are passed by reference in Python, so copy the labels with
        # sublabels = labels[:] to keep each recursive createtree() call from
        # mutating the caller's list
        sublabels = labels[:]
        mytree[bestfeatlabel][value] = createtree(splitdataset(dataset, bestfeat, value), sublabels)
    return mytree
# Quick test of the main code
mydat, labels = createdataset()
mytree = createtree(mydat, labels)
print(mytree)
# {'no surfacing': {0: 'no', 1: {'flipper': {0: 'no', 1: 'yes'}}}}