Shannon entropy: a measure of the amount of information in a set.
It is defined as the expected value of the information.
The expected information over all possible class values is H = -sum(p(xi) * log2(p(xi))) for i = 1..n, where n is the number of classes and p(xi) is the proportion of samples belonging to class xi.
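For instance, with two classes occurring with probabilities 2/5 and 3/5 (as in the sample data below):
H = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.971
which is exactly the 0.970950594455 the test further down prints.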
# -*- coding: utf-8 -*-
from math import log

def calcShannonEnt(dataSet):  # compute the Shannon entropy of a data set
    numEntries = len(dataSet)  # number of samples
    labelCounts = {}  # dictionary mapping each class label to its count
    for featVec in dataSet:
        currentLabel = featVec[-1]  # by convention the last column is the class label
        if currentLabel not in labelCounts:
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0  # the Shannon entropy
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries  # probability of this class in the data set
        shannonEnt -= prob * log(prob, 2)  # log(x, 2) is the base-2 logarithm of x
    return shannonEnt
def createDataSet():  # make up a small data set of our own
    dataSet = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no']
    ]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels
Testing it:
# -*- coding: utf-8 -*-
import trees
myDat, labels = trees.createDataSet()
print(myDat)
print(trees.calcShannonEnt(myDat))
Result:
[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
0.970950594455
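The more classes the labels are spread over, the higher the entropy. A quick experiment (this mutates the sample data in place, so the label is restored afterwards to keep the later tests consistent):
myDat[0][-1] = 'maybe'
print(trees.calcShannonEnt(myDat))  # ~1.3710: three classes carry more information than two
myDat[0][-1] = 'yes'  # restore the original label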
3.1.2 Splitting the data set: there can be many features, so we need to decide which feature gives the best split.
The basic idea: extract the subset of records whose value in feature column n equals m.
def splitDataSet(dataSet, axis, value):  # three parameters: the data set to split, the feature (column) to split on, and the feature value to keep
    # collect the records whose value in column axis equals value
    retDataSet = []  # build a new list so the input data set is not modified
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]  # cut column axis out of the record: take everything before it...
            reducedFeatVec.extend(featVec[axis+1:])  # ...then extend with everything after it (extend splices the elements in)
            retDataSet.append(reducedFeatVec)  # append adds the reduced record as a single element
    return retDataSet
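The extend/append distinction above is easy to trip over; a two-line illustration:
a = [1, 2]; b = [3, 4]
a.extend(b)  # a is now [1, 2, 3, 4]: b's elements are spliced in
a.append(b)  # a is now [1, 2, 3, 4, [3, 4]]: b is added as a single element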
Test:
print(myDat)
print(trees.splitDataSet(myDat, 0, 1))  # split myDat on featVec[0] == 1, i.e. keep the records whose value in column 0 equals 1, with that column removed
[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
[[1, 'yes'], [1, 'yes'], [0, 'no']]
3.3 Choosing the best feature to split on
PS: I had long been stuck on what Shannon entropy actually is, so I burned the midnight oil working through the decision-tree theory in《统计学习方法》, and now the code reads very clearly.
It suddenly strikes me that studying theory and practice side by side is a pretty good approach. A side gripe: across the school's two big-data labs I could not find a single senior student who could explain decision trees to me.
def chooseBestFeatureToSplit(dataSet):  # choose the best way to split the data set
    numFeatures = len(dataSet[0]) - 1  # number of features (the last column is the class label)
    baseEntropy = calcShannonEnt(dataSet)  # entropy before splitting
    bestInfoGain = 0.0; bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]  # all values of feature i
        uniqueVals = set(featList)  # deduplicate
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)  # split on feature i at this value
            prob = len(subDataSet) / float(len(dataSet))  # |Di| / |D|
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy  # information gain: the drop in entropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
PS: information gain is not the entropy itself; it is the reduction in entropy after the split (baseEntropy - newEntropy). Just follow the formula and it works out.
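Worked out on the sample data, where H(D) ≈ 0.971:
feature 0 splits D into [yes, yes, no] (H ≈ 0.918) and [no, no] (H = 0), so newEntropy = 3/5 * 0.918 ≈ 0.551 and the gain is ≈ 0.420;
feature 1 splits D into [yes, yes, no, no] (H = 1.0) and [no] (H = 0), so newEntropy = 4/5 * 1.0 = 0.8 and the gain is ≈ 0.171.
Feature 0 has the larger gain, which is exactly what the test below returns.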
Test:
import trees
myDat, labels = trees.createDataSet()
print(trees.chooseBestFeatureToSplit(myDat))
print(myDat)
Output: 0  # meaning column 0 is the best feature to split on
[[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
# print myDat to inspect the data set alongside the result
3-3, 3-4: majority voting and recursive tree construction
Before building the tree we have to handle one case: when all features are used up but the class labels are still not unique, we must decide how to label the leaf node, so we use majority voting.
Recursive tree building keeps looking for a feature to split on and stops once a branch's labels are all the same.
import operator

def majorityCnt(classList):  # majority vote over a list of class labels
    classCount = {}
    for vote in classList:
        if vote not in classCount: classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]  # the most common label, used to label the leaf node
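A quick sanity check on the voting (a hypothetical call):
print(majorityCnt(['yes', 'no', 'no']))  # -> 'no'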
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]  # list of all class labels in the data set
    if classList.count(classList[0]) == len(classList):  # stop splitting if the labels are all identical
        return classList[0]
    if len(dataSet[0]) == 1:  # all features used up: fall back to majority voting for the leaf label
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)  # pick the best feature to split on
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    del(labels[bestFeat])  # remove the used feature label before recursing (note: this mutates the caller's list)
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)  # all values the chosen feature takes
    for value in uniqueVals:
        subLabels = labels[:]  # copy, so the recursive calls don't interfere with each other
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
Result:
import trees
myDat, labels = trees.createDataSet()
myTree = trees.createTree(myDat, labels)
print(myTree)
Output: {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
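The nested dict is the whole tree: the top-level key is the feature tested at the root, and its children map feature values to either a class label (a leaf) or another dict (a subtree). A minimal sketch of walking it to classify one sample (this classify helper is my own addition, not one of the listings above):

def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]  # feature tested at this node
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)  # map the feature label back to a column index
    branch = secondDict[testVec[featIndex]]  # follow the branch matching this sample's value
    if isinstance(branch, dict):  # internal node: keep walking
        return classify(branch, featLabels, testVec)
    return branch  # leaf: the class label

For example, classify(myTree, ['no surfacing', 'flippers'], [1, 0]) returns 'no'. Note that createTree deletes entries from labels as it recurses, so pass a fresh label list here.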
3.2.1: Drawing tree nodes with arrow annotations:
import matplotlib.pyplot as plt

# text box and arrow styles
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")

# draw a node with an arrow annotation pointing from its parent point
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)

def createPlot():
    fig = plt.figure(1, facecolor='white')
    fig.clf()  # clear the figure before drawing
    createPlot.ax1 = plt.subplot(111, frameon=False)  # no frame; ticks left on for demo purposes
    plotNode('a decision node', (0.5, 0.1), (0.1, 0.5), decisionNode)  # draw the two node types
    plotNode('a leaf node', (0.8, 0.1), (0.3, 0.8), leafNode)
    plt.show()
Preview: (figure: a decision node and a leaf node, each with an arrow from its parent point)
3.2.2 Constructing the annotation tree
To draw a tree we first need the number of leaf nodes and the depth of the tree; in Python the tree is stored in a nested dictionary.
The leaf count fixes the extent of the x axis and the depth fixes the extent of the y axis.
Listing 3-6: getting the number of leaves and the number of levels:
def getNumLeafs(myTree):  # count the leaf nodes of a tree stored as a nested dict
    numLeafs = 0
    firstStr = list(myTree.keys())[0]  # the root label; from here we can walk the whole tree
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':  # a dict value means an internal node
            numLeafs += getNumLeafs(secondDict[key])  # recurse and accumulate the leaf count
        else:
            numLeafs += 1
    return numLeafs

def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = list(myTree.keys())[0]  # the root label
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            thisDepth = 1 + getTreeDepth(secondDict[key])  # recurse; depth grows by one per level
        else:
            thisDepth = 1
        if thisDepth > maxDepth: maxDepth = thisDepth  # keep the running maximum over all branches
    return maxDepth
def retrieveTree(i):  # canned test trees, so we don't have to rebuild one every time
    listOfTrees = [{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
                   {'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}}
                  ]
    return listOfTrees[i]
Output:
# -*- coding: utf-8 -*-
import treeplot
print(treeplot.retrieveTree(1))
mytree = treeplot.retrieveTree(0)
print(treeplot.getTreeDepth(mytree))
print(treeplot.getNumLeafs(mytree))
{'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}}
2
3
Drawing the decision tree as a figure:
def plotMidText(cntrPt, parentPt, txtString):  # fill the edge label in between a parent and a child node
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]  # midpoint of the parent and child x coordinates
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]  # midpoint of the parent and child y coordinates
    createPlot.ax1.text(xMid, yMid, txtString)
def plotTree(myTree, parentPt, nodeTxt):  # main drawing routine
    numLeafs = getNumLeafs(myTree)  # width of this subtree in leaves
    depth = getTreeDepth(myTree)  # depth of this subtree
    firstStr = list(myTree.keys())[0]  # label of this subtree's root
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.yOff)  # center the node over its leaves
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD  # move one level down
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':  # a dict child is a subtree: recurse
            plotTree(secondDict[key], cntrPt, str(key))
        else:  # otherwise draw the leaf node
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD  # move back up after finishing this subtree
def createPlot(inTree):
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])  # hide the axis ticks this time
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))  # total width in leaves
    plotTree.totalD = float(getTreeDepth(inTree))  # total depth in levels
    plotTree.xOff = -0.5 / plotTree.totalW; plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()
Graphical output: (figure: the fully annotated decision tree)
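To reproduce the figure, a call along these lines (assuming, as in the earlier tests, that the plotting code is saved as treeplot.py):

import treeplot
myTree = treeplot.retrieveTree(0)
treeplot.createPlot(myTree)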