The kNN algorithm is simple and effective, but it tells us nothing about the underlying structure of the data. Decision trees are another way to handle classification problems, and their main advantage is that the resulting model is very easy to interpret.
When constructing a decision tree, a key step is choosing which feature to split on at each node. To find the most decisive feature and obtain the best split, we have to evaluate every feature; the evaluation criteria are entropy and information gain. For an accessible introduction to these two concepts, see this post: http://m.blog.csdn.net/article/details?id=40736757.
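For reference, the standard definitions are: for a dataset $D$ with class proportions $p_k$, and a feature $A$ that partitions $D$ into subsets $D_v$ (one per feature value $v$),

$$H(D) = -\sum_k p_k \log_2 p_k, \qquad \operatorname{Gain}(D, A) = H(D) - \sum_v \frac{|D_v|}{|D|}\, H(D_v).$$

The feature with the largest gain is the one that reduces entropy the most.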
The steps for constructing a decision tree are:
- Split the dataset on each feature in turn and compute the information gain of each split.
- Take the feature with the largest gain as the split node for this round.
- Partition the dataset on that feature to obtain the resulting subsets.
- Apply steps 1-3 recursively to each subset.
- Stop when a termination condition is met; the result is the decision tree.
Code implementation
```python
from math import log
import operator

def claShang(dataSet):  # compute the Shannon entropy of the dataset
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]  # the class label is the last column
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt

dataSet = [[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
labels = ['no surfacing', 'flippers']
```
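As a quick sanity check (my own addition, not part of the original listing): two of the five samples are 'yes' and three are 'no', so the entropy of this toy dataset works out by hand to about 0.971.

```python
# H = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.971
print(claShang(dataSet))  # 0.9709505944546686
```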
```python
def splitdataset(dataSet, axis, value):  # split the dataset on a given feature value
    retdata = []
    for featVec in dataSet:
        if featVec[axis] == value:
            # keep the matching sample, minus the column we split on
            reduceFeatvec = featVec[:axis]
            reduceFeatvec.extend(featVec[axis + 1:])
            retdata.append(reduceFeatvec)
    return retdata
```
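For example (an illustrative call, not from the original listing), splitting on feature 0 with value 1 keeps the three samples whose first column is 1 and drops that column:

```python
print(splitdataset(dataSet, 0, 1))  # [[1, 'yes'], [1, 'yes'], [0, 'no']]
```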
```python
def choosBest(dataSet):  # choose the best feature to split on
    numFeatures = len(dataSet[0]) - 1  # the last column is the class label
    baseEntropy = claShang(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        feaList = [example[i] for example in dataSet]
        uniqueVals = set(feaList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitdataset(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * claShang(subDataSet)  # weighted entropy after the split
        infogain = baseEntropy - newEntropy
        if infogain > bestInfoGain:
            bestInfoGain = infogain
            bestFeature = i
    return bestFeature
```
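On the toy dataset, splitting on 'no surfacing' gives an information gain of about 0.420 versus about 0.171 for 'flippers', so feature 0 is chosen (a quick check using the functions above):

```python
print(choosBest(dataSet))  # 0
```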
```python
def majorityCnt(classList):  # majority vote over the class labels
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    # sort by count, descending
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
```
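majorityCnt is only needed when the features are exhausted but the remaining samples still carry mixed labels; it simply returns the most common label, e.g.:

```python
print(majorityCnt(['yes', 'no', 'yes']))  # 'yes'
```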
```python
def creatTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    if classList.count(classList[0]) == len(classList):
        return classList[0]            # all samples share one class: stop
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)  # no features left: majority vote
    bestFest = choosBest(dataSet)
    bestLabel = labels[bestFest]
    mytree = {bestLabel: {}}
    featvalue = [example[bestFest] for example in dataSet]
    uniquevals = set(featvalue)
    for value in uniquevals:
        sublabels = labels[:]          # copy so the caller's label list stays intact
        del(sublabels[bestFest])
        # recursively build the subtree for this feature value
        mytree[bestLabel][value] = creatTree(splitdataset(dataSet, bestFest, value), sublabels)
    return mytree

mytree = creatTree(dataSet, labels)
print(mytree)
```
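Running the script prints the nested-dictionary representation of the tree, which for this dataset is (up to key ordering) {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}. The listing above stops at construction; to actually use the tree on a new sample, a small recursive lookup is needed. The following classifier is just a sketch of how that lookup could be written, not part of the original code:

```python
def classify(inputTree, featLabels, testVec):
    # descend the nested dicts until a leaf (a class label string) is reached
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)  # map the node's label back to a feature index
    subTree = secondDict[testVec[featIndex]]
    if isinstance(subTree, dict):
        return classify(subTree, featLabels, testVec)
    return subTree

print(classify(mytree, labels, [1, 0]))  # 'no'
print(classify(mytree, labels, [1, 1]))  # 'yes'
```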