# -*- coding: utf-8 -*-
from math import log
import operator
import pickle
import sys
def calcShannonEnt(dataset):
    """Compute the Shannon entropy of a dataset.

    Input: a dataset (or sub-dataset) whose LAST column is the class
    label; the other columns are features.
    Returns: float — ent(D) = -sum(p * log2(p)) over the class
    probabilities p.
    """
    numSamples = len(dataset)
    labelCounts = {}
    for featureVector in dataset:
        label = featureVector[-1]
        # dict.get with a default replaces the explicit membership test
        labelCounts[label] = labelCounts.get(label, 0) + 1
    entropy = 0.0
    for count in labelCounts.values():
        # renamed from 'property', which shadowed the builtin
        prob = float(count) / numSamples
        entropy -= prob * log(prob, 2)
    return entropy
def creatDataSet():
    """Build and return the toy training set.

    Returns a (dataset, labels) pair: each row holds feature values
    followed by the class label in the last position.
    NOTE(review): rows carry 3 feature columns but only 2 feature
    names are listed — looks like a latent mismatch; confirm against
    the intended data before extending.
    """
    dataset = [
        [1, 1, 1, 'yes'],
        [1, 1, 0, 'yes'],
        [1, 0, 1, 'no'],
        [0, 1, 0, 'no'],
        [0, 0, 1, 'no'],
    ]
    labels = ['no surfacing', 'flippers']
    return dataset, labels
def getSubDataset(dataset, colIndex, value):
    """Extract the sub-dataset where feature `colIndex` equals `value`.

    The matched rows are returned WITH the colIndex-th column removed
    (the feature is consumed by the split). Rows that do not match are
    dropped. Returns a new list; input rows are not mutated.
    """
    subDataset = []
    for rowVector in dataset:
        if rowVector[colIndex] == value:
            # one slice concatenation drops the split column
            # (replaces the old slice + extend pair; dead debug
            # prints removed)
            subDataset.append(rowVector[:colIndex] + rowVector[colIndex + 1:])
    return subDataset
def getSubDataset_(dataset, colIndex, value):
    """Partition helper: rows whose colIndex-th feature equals value.

    Unlike getSubDataset, the matched rows are returned unchanged
    (the feature column is kept).
    """
    return [row for row in dataset if row[colIndex] == value]
def BestFeatToGetSubdataset(dataset):
    """Choose the splitting feature with maximal information gain.

    Input: dataset whose last column is the class label.
    Returns the column index of the best feature, or -1 when no
    split yields a positive gain.
    """
    # every column except the trailing label column is a feature
    numFeature = len(dataset[0]) - 1
    baseEntropy = calcShannonEnt(dataset)
    bestInfoGain = 0.0
    bestFeature = -1
    total = float(len(dataset))
    for col in range(numFeature):
        # conditional entropy of the class given feature `col`
        condEntropy = 0.0
        for value in set(row[col] for row in dataset):
            subset = getSubDataset_(dataset, col, value)
            condEntropy += (len(subset) / total) * calcShannonEnt(subset)
        gain = baseEntropy - condEntropy
        if gain > bestInfoGain:
            bestInfoGain = gain
            bestFeature = col
    return bestFeature
def mostClass(ClassList):
    """Return the most frequent class label in ClassList.

    Used as the majority-vote fallback when the features are
    exhausted. Fix: `dict.iteritems()` is Python 2 only and raises
    AttributeError under the Python 3 this file otherwise targets;
    use `items()` instead.
    """
    classCount = {}
    for class_i in ClassList:
        classCount[class_i] = classCount.get(class_i, 0) + 1
    # sort by count, descending; the first entry is the winner
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def creatTree(dataset, labels):
    """Recursively build an ID3 decision tree.

    Input: dataset (last column = class label), labels (feature names
    aligned with the feature columns).
    Returns the tree as nested dicts {featureName: {value: subtree}},
    with class-label strings at the leaves.

    Fix: the original did `del labels[bestFeat]`, destructively
    mutating the caller's list (main() had to defensively copy it).
    We now build the reduced label list as a new copy, which is
    backward-compatible for all callers.
    """
    classList = [example[-1] for example in dataset]
    # all samples share one class -> pure leaf
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # only the label column remains -> majority-vote leaf
    if len(dataset[0]) == 1:
        return mostClass(classList)
    # best feature index for this node
    bestFeat = BestFeatToGetSubdataset(dataset)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # labels with the chosen feature removed — a fresh list, so the
    # caller's `labels` is left intact
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    for value in set(example[bestFeat] for example in dataset):
        # sub-dataset for this feature value (column dropped to match
        # subLabels), then recurse
        subDataset = getSubDataset(dataset, bestFeat, value)
        myTree[bestFeatLabel][value] = creatTree(subDataset, subLabels[:])
    return myTree
def classify(inputTree, featlabels, testFeatValue):
    """Classify a feature vector with a trained decision tree.

    inputTree: nested-dict tree produced by creatTree.
    featlabels: full list of feature names (used to locate each
        node's feature in the test vector).
    testFeatValue: feature values of the sample to classify.
    Returns the predicted class label, or None when the tree has no
    branch for the sample's value.

    Fixes: `classLabel` was unbound (UnboundLocalError) when no
    branch matched; `type(x).__name__ == 'dict'` replaced with the
    idiomatic isinstance check.
    """
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featlabels.index(firstStr)
    classLabel = None  # fix: defined even if no branch matches
    for branchValue in secondDict.keys():
        if testFeatValue[featIndex] == branchValue:
            subtree = secondDict[branchValue]
            if isinstance(subtree, dict):
                # internal node -> descend
                classLabel = classify(subtree, featlabels, testFeatValue)
            else:
                # leaf -> class label
                classLabel = subtree
    return classLabel
if __name__ == '__main__':
    # Train on the toy data, then classify one unseen sample.
    dataset, labels = creatDataSet()
    print(dataset)
    # keep an untouched copy: creatTree mutates its labels argument
    storelabels = list(labels)
    trainTree = creatTree(dataset, labels)
    classlabel = classify(trainTree, storelabels, [1, 0, 0])
    print(classlabel)
# Python implementation of an ID3 decision tree