# -*- coding: utf-8 -*-
from math import log
import operator
import pickle
import sys
def calcShannonEnt(dataset):
    """Compute the Shannon entropy of a dataset.

    Input: a dataset (or sub-dataset) whose LAST column is the class
    label; the other columns are features.
    Returns: float — ent(D) = -sum(p * log2(p)) over the class
    probabilities p.
    """
    numSamples = len(dataset)
    labelCounts = {}
    for featureVector in dataset:
        label = featureVector[-1]
        # dict.get with a default replaces the explicit membership test
        labelCounts[label] = labelCounts.get(label, 0) + 1
    entropy = 0.0
    for count in labelCounts.values():
        # renamed from 'property', which shadowed the builtin
        prob = float(count) / numSamples
        entropy -= prob * log(prob, 2)
    return entropy
def creatDataSet():
    """Build and return the toy training set.

    Returns a (dataset, labels) pair: each row holds feature values
    followed by the class label in the last position.
    NOTE(review): rows carry 3 feature columns but only 2 feature
    names are listed — looks like a latent mismatch; confirm against
    the intended data before extending.
    """
    dataset = [
        [1, 1, 1, 'yes'],
        [1, 1, 0, 'yes'],
        [1, 0, 1, 'no'],
        [0, 1, 0, 'no'],
        [0, 0, 1, 'no'],
    ]
    labels = ['no surfacing', 'flippers']
    return dataset, labels
def getSubDataset(dataset, colIndex, value):
    """Extract the sub-dataset where feature `colIndex` equals `value`.

    The matched rows are returned WITH the colIndex-th column removed
    (the feature is consumed by the split). Rows that do not match are
    dropped. Returns a new list; input rows are not mutated.
    """
    subDataset = []
    for rowVector in dataset:
        if rowVector[colIndex] == value:
            # one slice concatenation drops the split column
            # (replaces the old slice + extend pair; dead debug
            # prints removed)
            subDataset.append(rowVector[:colIndex] + rowVector[colIndex + 1:])
    return subDataset
def getSubDataset_(dataset, colIndex, value):
    """Partition helper: rows whose colIndex-th feature equals value.

    Unlike getSubDataset, the matched rows are returned unchanged
    (the feature column is kept).
    """
    return [row for row in dataset if row[colIndex] == value]
def BestFeatToGetSubdataset(dataset):
    """Choose the splitting feature with maximal information gain.

    Input: dataset whose last column is the class label.
    Returns the column index of the best feature, or -1 when no
    split yields a positive gain.
    """
    # every column except the trailing label column is a feature
    numFeature = len(dataset[0]) - 1
    baseEntropy = calcShannonEnt(dataset)
    bestInfoGain = 0.0
    bestFeature = -1
    total = float(len(dataset))
    for col in range(numFeature):
        # conditional entropy of the class given feature `col`
        condEntropy = 0.0
        for value in set(row[col] for row in dataset):
            subset = getSubDataset_(dataset, col, value)
            condEntropy += (len(subset) / total) * calcShannonEnt(subset)
        gain = baseEntropy - condEntropy
        if gain > bestInfoGain:
            bestInfoGain = gain
            bestFeature = col
    return bestFeature
def mostClass(ClassList):
    """Return the most frequent class label in ClassList.

    Used as the majority-vote fallback when the features are
    exhausted. Fix: `dict.iteritems()` is Python 2 only and raises
    AttributeError under the Python 3 this file otherwise targets;
    use `items()` instead.
    """
    classCount = {}
    for class_i in ClassList:
        classCount[class_i] = classCount.get(class_i, 0) + 1
    # sort by count, descending; the first entry is the winner
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def creatTree(dataset, labels):
    """Recursively build an ID3 decision tree.

    Input: dataset (last column = class label), labels (feature names
    aligned with the feature columns).
    Returns the tree as nested dicts {featureName: {value: subtree}},
    with class-label strings at the leaves.

    Fix: the original did `del labels[bestFeat]`, destructively
    mutating the caller's list (main() had to defensively copy it).
    We now build the reduced label list as a new copy, which is
    backward-compatible for all callers.
    """
    classList = [example[-1] for example in dataset]
    # all samples share one class -> pure leaf
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # only the label column remains -> majority-vote leaf
    if len(dataset[0]) == 1:
        return mostClass(classList)
    # best feature index for this node
    bestFeat = BestFeatToGetSubdataset(dataset)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # labels with the chosen feature removed — a fresh list, so the
    # caller's `labels` is left intact
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    for value in set(example[bestFeat] for example in dataset):
        # sub-dataset for this feature value (column dropped to match
        # subLabels), then recurse
        subDataset = getSubDataset(dataset, bestFeat, value)
        myTree[bestFeatLabel][value] = creatTree(subDataset, subLabels[:])
    return myTree
def classify(inputTree, featlabels, testFeatValue):
    """Classify a feature vector with a trained decision tree.

    inputTree: nested-dict tree produced by creatTree.
    featlabels: full list of feature names (used to locate each
        node's feature in the test vector).
    testFeatValue: feature values of the sample to classify.
    Returns the predicted class label, or None when the tree has no
    branch for the sample's value.

    Fixes: `classLabel` was unbound (UnboundLocalError) when no
    branch matched; `type(x).__name__ == 'dict'` replaced with the
    idiomatic isinstance check.
    """
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featlabels.index(firstStr)
    classLabel = None  # fix: defined even if no branch matches
    for branchValue in secondDict.keys():
        if testFeatValue[featIndex] == branchValue:
            subtree = secondDict[branchValue]
            if isinstance(subtree, dict):
                # internal node -> descend
                classLabel = classify(subtree, featlabels, testFeatValue)
            else:
                # leaf -> class label
                classLabel = subtree
    return classLabel
if __name__ == '__main__':
    # Train on the toy data, then classify one unseen sample.
    dataset, labels = creatDataSet()
    print(dataset)
    # keep an untouched copy: creatTree mutates its labels argument
    storelabels = list(labels)
    trainTree = creatTree(dataset, labels)
    classlabel = classify(trainTree, storelabels, [1, 0, 0])
    print(classlabel)
# Python implementation of an ID3 decision tree