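# This series is based mainly on the book's source code, lightly modified and annotated; everything here has been run and verified. It is published so that later readers can avoid the same pitfalls. Tips are welcome!
# Please credit the source when reposting!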
from math import log
import operator
import matplotlib.pyplot as plt
def calcShannonEnt(dataSet):
    """Compute the Shannon entropy (base 2) of the class labels in ``dataSet``.

    The class label of every record is read from its last element.

    Args:
        dataSet: sequence of records; ``record[-1]`` is the class label.

    Returns:
        The entropy in bits as a float. An empty data set, or one whose
        records all share a single label, yields ``0.0``.
    """
    numEntries = len(dataSet)
    # Count how many records carry each class label.
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        labelCounts[currentLabel] = labelCounts.get(currentLabel, 0) + 1
    # H = -sum(p * log2(p)) over the observed labels.
    shannonEnt = 0.0
    for count in labelCounts.values():
        prob = float(count) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
def createDataSet():
    """Return a small toy data set together with its feature names.

    Returns:
        A ``(dataSet, labels)`` tuple. Each record in ``dataSet`` holds two
        feature values followed by the class label (``'yes'`` / ``'no'``);
        ``labels`` names the two features in column order. Fresh lists are
        built on every call.
    """
    dataSet = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels
# Build the toy data set and its feature names at module level.
myDat,labels=createDataSet()
# Entropy of the toy data set; the return value is neither stored nor printed here.
calcShannonEnt(myDat)
def splitDataSet(dataSet, axis, value):
    """Select the records whose feature ``axis`` equals ``value``, minus that feature.

    Args:
        dataSet: sequence of records (indexable sequences).
        axis: index of the feature column to test and remove.
        value: feature value a record must have to be kept.

    Returns:
        A new list of new lists: each matching record with column ``axis``
        removed. ``dataSet`` and its records are left unmodified.
    """
    return [
        # Unpacking into a list literal copies the record and works for
        # tuple records as well as list records.
        [*featVec[:axis], *featVec[axis + 1:]]
        for featVec in dataSet
        if featVec[axis] == value
    ]
def chooseBestFeatureToSplit(dataSet):
    """Pick the feature with the largest information gain (ID3 criterion).

    Args:
        dataSet: non-empty sequence of records whose last element is the
            class label and whose other elements are feature values.

    Returns:
        The column index of the feature with the highest information gain,
        or ``-1`` when no feature has a strictly positive gain (this includes
        records that contain only the class label).
    """
    numFeatures = len(dataSet[0]) - 1  # the last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    totalCount = float(len(dataSet))
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        # Distinct values taken by feature i.
        uniqueVals = set([example[i] for example in dataSet])
        # Entropy after the split, weighted by the share of records per value.
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / totalCount
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
def majorityCnt(classList):
    """Return the most frequent label in ``classList`` (majority vote).

    Args:
        classList: non-empty sequence of hashable class labels.

    Returns:
        The label with the highest count. On a tie, the label that was
        counted first wins, because the sort is stable.

    Raises:
        IndexError: if ``classList`` is empty.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # items() instead of the Python 2-only iteritems(); itemgetter(1) sorts
    # the (label, count) pairs by count, highest first.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Args:
        dataSet: non-empty sequence of records; the last element of each
            record is the class label, the others are feature values.
        labels: feature names, one per feature column of ``dataSet``.
            This sequence is not modified.

    Returns:
        Either a class label (leaf) or a dict of the form
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.
    """
    classList = [example[-1] for example in dataSet]
    # Every record has the same class: this branch is a leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the class column is left: fall back to a majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat == -1:
        # No feature has positive information gain. Using -1 as an index
        # would select the class column itself, so vote instead.
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # Build the reduced label list without touching the caller's ``labels``.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    uniqueVals = set([example[bestFeat] for example in dataSet])
    for value in uniqueVals:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels
        )
    return myTree
'''
#使用文本注解绘制树节点
decisionNode = dict(boxstyle="sawtooth",fc="0.8") #boxstyle为文本框的类型,sawtooth是锯齿形,fc是边框线粗细
leafNode = dict(boxstyle="round4",fc="0.8")#定义决策树的叶子节点的描述属性
arrow_args = dict(arrowstyle="<-")#定义决策树的箭头属性
def plotNode(nodeTxt,centerPt,parentPt,nodeType):#绘制节点
createPlot.axl.annotate(nodeTxt,xy=parentPt,xycoords='axes fraction',\
xytext=centerPt,textcoords='axes fraction',va="center",ha="center",bbox=nodeType,arrowprops=arrow_args)
def createPlot():
fig =plt.figure(1,figsize=(12,9),facecolor='white')
fig.clf()
createPlot.axl = plt.subplot(111,frameon=False)
plotNode('决策节点',(0.5,0.1),(0.1,0.5),decisionNode)
plotNode('叶节点',(0.8,0.1),(0.3,0.8),leafNode)
plt.show()
createPlot()
'''
#获取叶节点的数目和树的层数
def getNumLeafs(myTree):
    """Count the leaf nodes of a decision tree.

    Args:
        myTree: tree of the form ``{feature_name: {value: subtree_or_leaf}}``;
            only the first key of the dict is used as the root.

    Returns:
        The number of non-dict values reachable from the root.
    """
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    numLeafs = 0
    for child in secondDict.values():
        if isinstance(child, dict):
            # A dict is a subtree; anything else is a leaf.
            numLeafs += getNumLeafs(child)
        else:
            numLeafs += 1
    return numLeafs
def getTreeDepth(myTree):
    """Return the number of decision levels on the longest path of a tree.

    Args:
        myTree: tree of the form ``{feature_name: {value: subtree_or_leaf}}``;
            only the first key of the dict is used as the root.

    Returns:
        ``1`` for a root whose children are all leaves, plus one for every
        further nested decision node on the deepest branch; ``0`` when the
        root has no children.
    """
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    maxDepth = 0
    for child in secondDict.values():
        if isinstance(child, dict):
            # A dict is a subtree; anything else is a leaf.
            thisDepth = 1 + getTreeDepth(child)
        else:
            thisDepth = 1
        if thisDepth > maxDepth:
            maxDepth = thisDepth
    return maxDepth
# Box and arrow styles shared by the tree-plotting functions.
decisionNode = dict(boxstyle="sawtooth",fc="0.8") # bbox style for decision nodes: sawtooth outline; fc is the face (fill) colour, here the gray level 0.8
leafNode = dict(boxstyle="round4",fc="0.8")# bbox style for leaf nodes: rounded box with the same fill colour
arrow_args = dict(arrowstyle="<-")# arrow style, passed to annotate() as arrowprops
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw one boxed node label with an arrow between it and its parent point.

    Draws on ``createPlot.ax1``, so ``createPlot`` must have set that
    attribute first.

    Args:
        nodeTxt: text shown inside the box.
        centerPt: ``(x, y)`` position of the text, in axes-fraction coordinates.
        parentPt: ``(x, y)`` position of the parent, in axes-fraction coordinates.
        nodeType: bbox style dict (``decisionNode`` or ``leafNode``).
    """
    createPlot.ax1.annotate(
        nodeTxt,
        xy=parentPt,
        xycoords='axes fraction',
        xytext=centerPt,
        textcoords='axes fraction',
        va="center",
        ha="center",
        bbox=nodeType,
        arrowprops=arrow_args,
    )
def plotMidText(cntrPt, parentPt, txtString):
    """Write ``txtString`` at the midpoint between a node and its parent.

    Draws on ``createPlot.ax1``, so ``createPlot`` must have set that
    attribute first.

    Args:
        cntrPt: ``(x, y)`` position of the child node.
        parentPt: ``(x, y)`` position of the parent node.
        txtString: text to draw (the feature value labelling the branch),
            rotated by 30 degrees.
    """
    xMid = (parentPt[0] - cntrPt[0]) / 2.0 + cntrPt[0]
    yMid = (parentPt[1] - cntrPt[1]) / 2.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30)
def plotTree(myTree, parentPt, nodeTxt):
    """Recursively draw ``myTree`` below ``parentPt``.

    Layout state lives in function attributes that ``createPlot`` initialises:
    ``plotTree.totalW`` (leaf count of the whole tree), ``plotTree.totalD``
    (depth of the whole tree), and the running cursor ``plotTree.xOff`` /
    ``plotTree.yOff``.

    Args:
        myTree: tree of the form ``{feature_name: {value: subtree_or_leaf}}``;
            the first key is the feature this node splits on.
        parentPt: ``(x, y)`` position of the parent node.
        nodeTxt: text for the edge from the parent to this node.
    """
    numLeafs = getNumLeafs(myTree)  # determines the x width of this subtree
    firstStr = list(myTree.keys())[0]  # the text label for this node
    # Centre this node above the horizontal span its leaves will occupy.
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    # Step one level down before drawing the children.
    plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD
    for key in secondDict.keys():
        child = secondDict[key]
        if isinstance(child, dict):
            # A dict value is a subtree: recurse with this node as parent.
            plotTree(child, cntrPt, str(key))
        else:
            # Any other value is a leaf: move the x cursor one slot and draw it.
            plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
            plotNode(child, (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    # Restore the y cursor for the caller's remaining siblings.
    plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD
def createPlot(inTree):
    """Draw the decision tree ``inTree`` in a matplotlib figure and show it.

    Stores the axes in ``createPlot.ax1`` and initialises the layout state
    (``totalW``, ``totalD``, ``xOff``, ``yOff``) on ``plotTree`` before
    starting the recursive drawing.

    Args:
        inTree: tree of the form ``{feature_name: {value: subtree_or_leaf}}``.
    """
    fig = plt.figure(1, figsize=(8, 8), facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    # Frameless axes without ticks; drop **axprops to show ticks while debugging.
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    # Start half a leaf slot left of the axes so the first leaf is centred in its slot.
    plotTree.xOff = -0.5/plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()
#创建树结构测试上2函数
def retrieveTree(i):
    """Return one of two hard-coded sample trees for testing.

    Args:
        i: index of the tree to return (``0`` or ``1``).

    Returns:
        A newly built tree of the form
        ``{feature_name: {value: subtree_or_leaf}}``.

    Raises:
        IndexError: if ``i`` is outside the list of sample trees.
    """
    listOfTrees = [
        {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
        {'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}},
    ]
    return listOfTrees[i]
# Fetch the first sample tree.
myTree=retrieveTree(0)
# Add a third branch under the root: feature value 3 leads to the leaf 'maybe'.
myTree['no surfacing'][3]='maybe'
#使用决策树的分类函数
def classify(inputTree, featLabels, testVec):
    """Classify ``testVec`` by walking the decision tree ``inputTree``.

    Args:
        inputTree: tree of the form ``{feature_name: {value: subtree_or_leaf}}``.
        featLabels: feature names; the position of a name is the index of
            that feature's value in ``testVec``.
        testVec: feature values of the sample to classify.

    Returns:
        The class label stored at the leaf that is reached.

    Raises:
        ValueError: if a feature name in the tree is missing from ``featLabels``.
        KeyError: if the tree has no branch for the sample's feature value.
    """
    # dict views cannot be indexed in Python 3, so materialise the keys first.
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    valueOfFeat = secondDict[testVec[featIndex]]
    if isinstance(valueOfFeat, dict):
        # Still a subtree: keep descending until a leaf is reached.
        return classify(valueOfFeat, featLabels, testVec)
    return valueOfFeat
#使用pickle模块存储决策树
def storeTree(inputTree, filename):
    """Serialise ``inputTree`` to ``filename`` with pickle.

    Args:
        inputTree: the (picklable) decision tree to store.
        filename: path of the file to create or overwrite.
    """
    import pickle
    # pickle writes bytes, so the file must be opened in binary mode;
    # the with-block closes it even if dump() raises.
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)
def grabTree(filename):
    """Load and return a decision tree previously pickled to ``filename``.

    Args:
        filename: path of a file containing a pickled object.

    Returns:
        The unpickled object.
    """
    import pickle
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # from a trusted source.
    # pickle reads bytes, so open in binary mode; the with-block closes the file.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
#实例,使用决策树预测隐形眼镜类型
# Example: build and plot a decision tree from the contact-lens data file.
# NOTE: the path below is a placeholder; replace it with the real location of lenses.txt.
with open(r'选择你自己的数据集存储路径\lenses.txt') as fr:
    # Strip surrounding whitespace from each line and split it on tabs;
    # the with-block closes the file once the records have been read.
    lenses = [inst.strip().split('\t') for inst in fr]
lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
lensesTree = createTree(lenses, lensesLabels)
createPlot(lensesTree)