# 简单的ID3决策树实现;C4.5与ID3的差异在于用信息增益率代替信息增益来选择划分属性。具体公式网上都有,这里只做简单验证。
#coding:utf-8
import math
def calcShannonEnt(dataset):
    """Return the base-2 Shannon entropy of the class labels in *dataset*.

    Each row of *dataset* is a sequence whose last element is its class
    label; all other elements are ignored.  An empty dataset yields 0.0.
    """
    numEntries = len(dataset)
    # Tally how many rows carry each class label.
    labelCounts = {}
    for featVec in dataset:
        currentLabel = featVec[-1]  # the last column holds the class
        # dict.get keeps this a single hash lookup per row instead of
        # scanning a freshly built key list each time.
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        shannonEnt -= prob * math.log(prob, 2)
    return shannonEnt
def CreateDataSet():
    """Build the bundled 14-row weather sample and its attribute names.

    Returns a ``(dataset, labels)`` pair.  Every dataset row is a list of
    four attribute values followed by a 'yes'/'no' class value, and
    ``labels`` names the four attribute columns in order.
    """
    attributeNames = ['outlook', 'temperature', 'humidity', 'wind']
    # Column order: outlook, temperature, humidity, wind, class.
    samples = (
        ('sunny', 'hot', 'high', 'weak', 'no'),
        ('sunny', 'hot', 'high', 'strong', 'no'),
        ('overcast', 'hot', 'high', 'weak', 'yes'),
        ('rain', 'mild', 'high', 'weak', 'yes'),
        ('rain', 'cool', 'normal', 'weak', 'yes'),
        ('rain', 'cool', 'normal', 'strong', 'no'),
        ('overcast', 'cool', 'normal', 'strong', 'yes'),
        ('sunny', 'mild', 'high', 'weak', 'no'),
        ('sunny', 'cool', 'normal', 'weak', 'yes'),
        ('rain', 'mild', 'normal', 'weak', 'yes'),
        ('sunny', 'mild', 'normal', 'strong', 'yes'),
        ('overcast', 'mild', 'high', 'strong', 'yes'),
        ('overcast', 'hot', 'normal', 'weak', 'yes'),
        ('rain', 'mild', 'high', 'strong', 'no'),
    )
    # Hand back fresh, mutable lists on every call.
    return [list(sample) for sample in samples], attributeNames
def splitDataSet(dataSet, axis, value):
    """Select the rows whose column *axis* equals *value*, minus that column.

    Args:
        dataSet: iterable of rows (indexable sequences).
        axis: index of the column to test and remove; negative indices
            count from the end of the row.
        value: the column value a row must have to be kept.

    Returns:
        A new list of new lists; the rows of *dataSet* are not modified.
    """
    retDataSet = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # Copy, then delete by index: unlike stitching the slices
            # [:axis] and [axis + 1:], this removes the intended column
            # for negative indices too (axis + 1 == 0 when axis == -1).
            reducedFeatVec = list(featVec)
            del reducedFeatVec[axis]
            retDataSet.append(reducedFeatVec)
    return retDataSet
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the attribute with the highest gain ratio (C4.5).

    The gain ratio of an attribute is its information gain (entropy of the
    whole set minus the weighted entropy of the subsets it induces) divided
    by its split information (entropy of the subset-size distribution).

    Args:
        dataSet: non-empty list of rows; the last element of each row is the
            class label and every other element is an attribute value.

    Returns:
        The column index of the best attribute, or -1 when no attribute has
        a strictly positive gain ratio.
    """
    numberFeatures = len(dataSet[0]) - 1
    baseEntropy = calcShannonEnt(dataSet)
    totalRows = float(len(dataSet))
    bestGainRatio = 0.0
    bestFeature = -1
    for i in range(numberFeatures):
        uniqueVals = set(example[i] for example in dataSet)
        if len(uniqueVals) < 2:
            # A single-valued attribute cannot partition the data, and its
            # split information is 0, so the ratio below would divide by zero.
            continue
        newEntropy = 0.0
        splitInfo = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / totalRows
            newEntropy += prob * calcShannonEnt(subDataSet)
            splitInfo -= prob * math.log(prob, 2)  # entropy of the split itself
        gainRatio = (baseEntropy - newEntropy) / splitInfo
        if gainRatio > bestGainRatio:
            bestGainRatio = gainRatio
            bestFeature = i
    return bestFeature
def majorityCnt(classList):
    """Return the most frequent class label in *classList* (majority vote).

    Intended for the case where every attribute has been used up but the
    remaining rows still disagree on their class.

    Raises:
        ValueError: if *classList* is empty.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # Rank labels by their tally, not by the label value itself.
    return max(classCount, key=classCount.get)
def createTree(dataSet, labels):
    """Recursively build a decision tree represented as nested dicts.

    Args:
        dataSet: non-empty list of rows; the last element of each row is the
            class label and every other element is an attribute value.
        labels: attribute names, one per attribute column of *dataSet*.
            The sequence is read only; it is never modified.

    Returns:
        A class label when the node is a leaf, otherwise a dict of the form
        ``{attributeName: {attributeValue: subtree, ...}}``.
    """
    classList = [example[-1] for example in dataSet]
    # Stop: every remaining row belongs to the same class.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Stop: only the class column is left, so settle it by majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat < 0:
        # No attribute improves the split (e.g. identical rows carrying
        # different classes).  Using -1 as an index would address the class
        # column, so fall back to a majority vote instead.
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # Drop the consumed attribute name without touching the caller's list.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    uniqueVals = set(example[bestFeat] for example in dataSet)
    # One subtree per distinct value of the chosen attribute.
    for value in uniqueVals:
        subDataSet = splitDataSet(dataSet, bestFeat, value)
        myTree[bestFeatLabel][value] = createTree(subDataSet, subLabels)
    return myTree
# Demo: build the decision tree for the bundled sample data and show it.
myDat, labels = CreateDataSet()
tree = createTree(myDat, labels)
# The parenthesised form prints the same on Python 2 and is valid on Python 3.
print(tree)