from math import log
import operator   # used by majorityCnt for sorting vote counts
# Compute the Shannon entropy of a dataset, using the class label
# stored in the last column of each row
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]            # class label is the last element
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)     # H = -sum(p * log2(p))
    return shannonEnt
def createDataSet():
    # Toy dataset: two binary features and a 'yes'/'no' class label
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']     # names of the two feature columns
    return dataSet, labels
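As a quick sanity check (an interactive-shell sketch; myDat and featLabels are just illustration names), the toy set has two 'yes' and three 'no' labels, so its entropy should be -(2/5)log2(2/5) - (3/5)log2(3/5) ≈ 0.971:

>>> myDat, featLabels = createDataSet()
>>> calcShannonEnt(myDat)
0.9709505944546686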
# Return the subset of dataSet whose feature at index `axis` equals `value`,
# with that feature column removed from each row
def splitDataSet(dataSet, axis, value):
    retDataSet = []                           # new list holding the matching rows
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]              # columns before the feature
            reducedFeatVec.extend(featVec[axis + 1:])    # columns after it
            retDataSet.append(reducedFeatVec)
    return retDataSet
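For example, continuing the shell session above, splitting the toy set on feature 0 keeps only the matching rows and drops that column:

>>> splitDataSet(myDat, 0, 1)
[[1, 'yes'], [1, 'yes'], [0, 'no']]
>>> splitDataSet(myDat, 0, 0)
[[1, 'no'], [1, 'no']]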
# Choose the feature with the largest information gain,
# i.e. the split that most reduces the entropy computed above
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1          # last column is the class label, not a feature
    baseEntropy = calcShannonEnt(dataSet)      # entropy before any split
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]   # all values of feature i
        uniqueVals = set(featList)             # a set keeps only the distinct values
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))   # P(feature i == value)
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy    # reduction in entropy from this split
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
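On the toy data this picks feature 0: splitting on 'no surfacing' leaves a weighted entropy of (3/5)(0.918) ≈ 0.551 (gain ≈ 0.420), while splitting on 'flippers' leaves (4/5)(1.0) = 0.8 (gain ≈ 0.171):

>>> chooseBestFeatureToSplit(myDat)
0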
# Majority vote: return the class label that occurs most often in classList
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # items() replaces the Python 2-only iteritems(); sort by count, descending
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
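This is only reached when the features are exhausted but the labels still disagree; for instance:

>>> majorityCnt(['yes', 'no', 'no'])
'no'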
# Recursively build the decision tree as nested dictionaries
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]                    # all labels identical: stop splitting
    if len(dataSet[0]) == 1:                   # no features left: fall back to majority vote
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)   # index of the best feature
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])                      # this feature is consumed at this level
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)               # distinct values of the best feature
    for value in uniqueVals:
        subLabels = labels[:]                  # copy, so recursion cannot clobber labels
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
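Putting everything together on the toy data (a fresh shell session, since createTree deletes entries from the labels list it is given; the exact key order inside the nested dicts may vary):

>>> myDat, featLabels = createDataSet()
>>> createTree(myDat, featLabels)
{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}

Each internal node is a dict keyed by a feature name, mapping each feature value either to a class label or to another subtree.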
Machine Learning in Action: Decision Trees
This post shows how to implement a decision tree in Python: computing Shannon entropy, creating a sample dataset, choosing the best feature to split on, and building the tree recursively. The example demonstrates how feature selection and probability calculations combine to construct a decision-tree model.