# 写在前面的话:所有代码均是照着《机器学习实战》一书敲的,只是加上了自己在学习过程中的一些注释,仅供自己复习。这只是调试成功之后的留档,不喜勿喷,可以绕道。
from math import log
import operator
import matplotlib.pyplot as plt
import pickle
#计算给定数据集的香农熵
def calcShannoEnt(dataSet):
    """Compute the base-2 Shannon entropy of the class labels in a data set.

    Args:
        dataSet: Sequence of records; the last element of each record is
            used as its class label.

    Returns:
        The entropy as a float. An empty data set yields 0.0 because the
        summation loop never executes.
    """
    total = len(dataSet)
    # Tally how many records carry each class label.
    tally = {}
    for record in dataSet:
        label = record[-1]
        tally[label] = tally.get(label, 0) + 1
    entropy = 0.0
    for count in tally.values():
        p = float(count) / total
        # log(x, base): entropy is -sum(p * log2(p)) over all labels.
        entropy -= p * log(p, 2)
    return entropy
#按照给定的特征划分数据集
def splitDataSet(dataSet,axis,value):
    """Select the records whose feature at ``axis`` equals ``value``.

    Args:
        dataSet: Sequence of list records.
        axis: Index of the feature to test.
        value: Feature value a record must have to be kept.

    Returns:
        A new list holding, for every matching record, a new list with the
        element at ``axis`` removed.
    """
    matching = [record for record in dataSet if record[axis] == value]
    trimmedRecords = []
    for record in matching:
        # Slice first so the caller's record is never modified, then
        # extend() the copy with everything after the removed column.
        trimmed = record[:axis]
        trimmed.extend(record[axis + 1:])
        trimmedRecords.append(trimmed)
    return trimmedRecords
#选择最好的数据集划分方式
def chooseBestFeatureToSplit(dataSet):
    """Pick the feature index whose split gives the largest information gain.

    Args:
        dataSet: Non-empty sequence of records; every element except the
            last is treated as a feature and the last as the class label.

    Returns:
        The index of the best feature, or -1 when no feature yields a gain
        strictly greater than 0.0.
    """
    featureCount = len(dataSet[0]) - 1
    rootEntropy = calcShannoEnt(dataSet)
    topGain = 0.0
    topFeature = -1
    for column in range(featureCount):
        columnValues = [row[column] for row in dataSet]
        distinctValues = set(columnValues)
        # Weighted sum of the entropies of the subsets produced by this split.
        splitEntropy = 0.0
        for candidate in distinctValues:
            subset = splitDataSet(dataSet, column, candidate)
            weight = len(subset) / float(len(dataSet))
            splitEntropy += weight * calcShannoEnt(subset)
        gain = rootEntropy - splitEntropy
        if gain > topGain:
            topGain, topFeature = gain, column
    return topFeature
def majorityCnt(classList):
    """Return the class label that occurs most often in ``classList``.

    Args:
        classList: Non-empty sequence of hashable class labels.

    Returns:
        The most frequent label. When several labels share the highest
        count, the one that appears first in ``classList`` is returned.

    Raises:
        ValueError: If ``classList`` is empty.
    """
    if not classList:
        raise ValueError("majorityCnt() requires at least one class label")
    votes = {}
    for label in classList:
        votes[label] = votes.get(label, 0) + 1
    # max() returns the first maximal item and dicts iterate in insertion
    # order, so ties resolve to the earliest-seen label.
    return max(votes.items(), key=operator.itemgetter(1))[0]
#创建树的函数
def createTree(dataSet,labels):
    """Recursively build a decision tree represented as nested dicts.

    Each internal node is ``{featureLabel: {featureValue: subtree_or_class}}``;
    the feature to split on is the one chosen by chooseBestFeatureToSplit.

    Args:
        dataSet: Non-empty sequence of list records whose last element is the
            class label.
        labels: Names of the features, in the same order as the feature
            columns of ``dataSet``. The list is not modified.

    Returns:
        A class label when the records cannot or need not be split further,
        otherwise a nested dict describing the subtree.
    """
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        # Every record has the same class: this branch is a leaf.
        return classList[0]
    if len(dataSet[0]) == 1:
        # All features are used up but the classes still differ.
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat < 0:
        # No feature reduces the entropy. Using -1 as an index would split on
        # the class column itself, so fall back to a majority vote instead.
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    # Build a new label list instead of del-ing from the caller's list, so
    # the caller can keep using ``labels`` afterwards.
    remainingLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    myTree = {bestFeatLabel: {}}
    uniqueVals = set(example[bestFeat] for example in dataSet)
    for value in uniqueVals:
        subset = splitDataSet(dataSet, bestFeat, value)
        myTree[bestFeatLabel][value] = createTree(subset, remainingLabels)
    return myTree
# Box styles (used as the annotation bbox) for decision and leaf nodes, and
# the arrow properties used for the edges between nodes.
decisionNode = {"boxstyle": "sawtooth", "fc": "0.8"}
leafNode = {"boxstyle": "round4", "fc": "0.8"}
arrow_args = {"arrowstyle": "<-"}
def plotNode(nodeTxt,centerPt,parentPt,nodeType):
    """Annotate the axes stored in ``createPlot.ax1`` with one tree node.

    Args:
        nodeTxt: Text shown inside the node box.
        centerPt: (x, y) position of the text box, in axes-fraction coordinates.
        parentPt: (x, y) position the arrow is attached to, in axes-fraction
            coordinates.
        nodeType: Dict of box properties passed as the annotation ``bbox``.
    """
    annotationOptions = dict(
        xy=parentPt,
        xycoords='axes fraction',
        xytext=centerPt,
        textcoords='axes fraction',
        va="center",
        ha="center",
        bbox=nodeType,
        arrowprops=arrow_args,
    )
    createPlot.ax1.annotate(nodeTxt, **annotationOptions)
#def createPlot():
# fig=plt.figure(1,facecolor='white')
# fig.clf()
# createPlot.ax1=plt.subplot(111,frameon=False)
# plotNode('a decision node',(0.5,0.1),(0.1,0.5),decisionNode)
# plotNode('a leaf node',(0.8,0.1),(0.3,0.8),leafNode)
# plt.show()
def getNumLeafs(myTree):
    """Count the leaves of a decision tree stored as nested dicts.

    Args:
        myTree: Dict with a single key (the feature label) whose value is a
            dict mapping feature values to subtrees or class labels.

    Returns:
        The number of non-dict values reachable from ``myTree``.
    """
    rootLabel = list(myTree)[0]
    leafCount = 0
    for child in myTree[rootLabel].values():
        # isinstance() also recognises dict subclasses (e.g. OrderedDict) as
        # subtrees; comparing the type name would count them as leaves.
        if isinstance(child, dict):
            leafCount += getNumLeafs(child)
        else:
            leafCount += 1
    return leafCount
def getTreeDepth(myTree):
    """Return the number of decision levels in a nested-dict decision tree.

    Args:
        myTree: Dict with a single key (the feature label) whose value is a
            dict mapping feature values to subtrees or class labels.

    Returns:
        The length of the longest chain of decision nodes from the root; a
        tree whose branches are all leaves has depth 1.
    """
    rootLabel = list(myTree)[0]
    maxDepth = 0
    for child in myTree[rootLabel].values():
        # isinstance() also recognises dict subclasses (e.g. OrderedDict) as
        # subtrees; comparing the type name would treat them as leaves.
        if isinstance(child, dict):
            branchDepth = 1 + getTreeDepth(child)
        else:
            branchDepth = 1
        maxDepth = max(maxDepth, branchDepth)
    return maxDepth
def retrieveTree(i):
    """Return one of two hard-coded sample decision trees.

    Args:
        i: List index selecting the tree (0 for the two-level tree, 1 for the
            three-level tree).

    Returns:
        A freshly built nested dict; the trees are rebuilt on every call.
    """
    twoLevelTree = {
        'no surfacing': {
            0: 'no',
            1: {'flippers': {0: 'no', 1: 'yes'}},
        },
    }
    threeLevelTree = {
        'no surfacing': {
            0: 'no',
            1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}},
        },
    }
    sampleTrees = [twoLevelTree, threeLevelTree]
    return sampleTrees[i]
def plotMidText(cntrPt, parentPt, txtString):
    """Write ``txtString`` at the midpoint between a node and its parent.

    Args:
        cntrPt: (x, y) position of the child node.
        parentPt: (x, y) position of the parent node.
        txtString: Text to place on the axes stored in ``createPlot.ax1``.
    """
    childX = cntrPt[0]
    childY = cntrPt[1]
    # Midpoint of the edge: half the offset to the parent, added to the child.
    midX = (parentPt[0] - childX) / 2.0 + childX
    midY = (parentPt[1] - childY) / 2.0 + childY
    createPlot.ax1.text(midX, midY, txtString)
def plotTree(myTree, parentPt, nodeTxt):
    """Recursively draw a nested-dict decision tree.

    Layout state is kept in function attributes that the caller must set
    before the first call: ``plotTree.totalW`` and ``plotTree.totalD`` (used
    as the horizontal and vertical step divisors) and ``plotTree.xOff`` /
    ``plotTree.yOff`` (the current drawing cursor).

    Args:
        myTree: Dict with a single key (the feature label) whose value maps
            feature values to subtrees or class labels.
        parentPt: (x, y) position of the parent node.
        nodeTxt: Text written on the edge from the parent to this node.
    """
    numLeafs = getNumLeafs(myTree)
    firstStr = list(myTree.keys())[0]
    # Centre this node horizontally above the leaves it spans.
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW, plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    # Step down one level before drawing the children.
    plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD
    for key, child in secondDict.items():
        if isinstance(child, dict):
            # A dict value is a subtree: draw it with this node as parent.
            plotTree(child, cntrPt, str(key))
        else:
            # Anything else is a leaf: advance the x cursor and draw it.
            plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
            plotNode(child, (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    # Restore the y cursor so siblings of this node are drawn on its level.
    plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD
def createPlot(inTree):
    """Draw a nested-dict decision tree in matplotlib figure 1 and show it.

    The axes are stored in ``createPlot.ax1`` and the layout attributes of
    ``plotTree`` are initialised from the tree before drawing starts.

    Args:
        inTree: Decision tree as nested dicts.
    """
    figure = plt.figure(1, facecolor='white')
    figure.clf()
    # Frameless axes without tick marks on either axis.
    createPlot.ax1 = plt.subplot(111, frameon=False, xticks=[], yticks=[])
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    # Start half a leaf-width left of the axes so the first leaf is centred
    # in its slot, and at the top of the axes.
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 1.0
    plotTree(inTree, (0.5, 1.0), '')
    plt.show()
def classify(inputTree, featLabels, testVec):
    """Classify one feature vector by walking a nested-dict decision tree.

    Args:
        inputTree: Dict with a single key (the feature label) whose value
            maps feature values to subtrees or class labels.
        featLabels: List of feature labels; the position of a label gives the
            index of that feature in ``testVec``.
        testVec: Feature values of the sample to classify.

    Returns:
        The class label stored at the leaf that ``testVec`` reaches.

    Raises:
        ValueError: If a node's feature label is not in ``featLabels``, or if
            the tree has no branch for the sample's value of that feature.
    """
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    featValue = testVec[featIndex]
    for key, child in secondDict.items():
        if featValue == key:
            if isinstance(child, dict):
                return classify(child, featLabels, testVec)
            return child
    # Without this, an unseen feature value left the result variable unbound
    # and surfaced as an UnboundLocalError.
    raise ValueError("decision tree has no branch for %r = %r" % (firstStr, featValue))
def storeTree(inputTree, filename):
    """Pickle a decision tree to a file.

    Args:
        inputTree: Object to serialise (the nested-dict tree).
        filename: Path of the file to write; it is opened in binary mode and
            overwritten if it already exists.
    """
    with open(filename, 'wb') as sink:
        pickler = pickle.Pickler(sink)
        pickler.dump(inputTree)
def grabTree(filename):
    """Load a pickled decision tree from a file.

    Args:
        filename: Path of a file containing a pickled object.

    Returns:
        The unpickled object.
    """
    # NOTE: pickle.load can execute arbitrary code while unpickling; only use
    # this on files that come from a trusted source.
    # The with-block guarantees the handle is closed once loading finishes.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
def createDataSet():
    """Return a small hard-coded sample data set and its feature labels.

    Returns:
        A ``(dataSet, labels)`` tuple: five records of two 0/1 features
        followed by a 'yes'/'no' class label, and the two feature names.
    """
    featureNames = ['no surfacing', 'flippers']
    records = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    return records, featureNames
if __name__ == '__main__':
    # Earlier experiments, kept for reference.
    #dataSet, labels = createDataSet()
    #featLabels = []
    #myTree = createTree(dataSet, labels)
    #createPlot()
    #retrieveTree(1)
    #myTree=retrieveTree(0)
    #createPlot(myTree)
    #getNumLeafs(myTree)
    #getTreeDepth(myTree)
    #classify(myTree,labels,[1,0])
    #storeTree(myTree,'classifierStorage.txt')
    #grabTree('classifierStorage.txt')

    # Build and draw a decision tree from the tab-separated lenses data.
    # The with-block closes the file; blank lines are skipped so that they
    # do not turn into malformed one-element records.
    with open('lenses.txt') as fr:
        lenses = [inst.strip().split('\t') for inst in fr if inst.strip()]
    lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
    lensesTree = createTree(lenses, lensesLabels)
    createPlot(lensesTree)