#书上用的ID3算法,选用信息增益做为选取特征的依据。现在把基于信息增益率的C45和CART(基尼指数)实现一下
#计算信息熵 calcShannoEnt
#提取划分后的子集 splitDataSet
#找最优划分属性 chooseBestFeatureToSplit
#计算出现次数最多的类,作为该集合的类 majorityCnt
#构建决策树 createTree
# 计算calcShannoEnt
from math import log
def calcShannoEnt(dataset):
    """Return the Shannon entropy of the class labels in *dataset*.

    Each row is a feature vector whose LAST element is the class label.
    Ent = -p1*log2(p1) - ... - pn*log2(pn)
    """
    labelCount = {}  # label -> number of rows with that label
    for featvec in dataset:
        currentLabel = featvec[-1]
        # fix: original called dict.key(), which does not exist (AttributeError)
        labelCount[currentLabel] = labelCount.get(currentLabel, 0) + 1
    shannonEnt = 0.0
    numEntries = len(dataset)
    for key in labelCount:
        prob = float(labelCount[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
#提取划分后的子集 splitDataSet
def splitDataSet(dataset, axis, value):
    """Return the rows of *dataset* whose feature at index *axis* equals *value*,
    with that feature column removed (it has been consumed by the split)."""
    retDataset = []  # fix: was {} — a dict has no .append, so this always crashed
    for featvec in dataset:
        if featvec[axis] == value:
            reducedfeatvec = featvec[:axis]
            reducedfeatvec.extend(featvec[axis + 1:])
            retDataset.append(reducedfeatvec)
    return retDataset
#找最优划分属性 chooseBestFeatureToSplit
def chooseBestFeatureToSplit(dataset):
    """Return the index of the feature with the largest information gain (ID3).

    fix: the original took no parameter but read a global `dataset`
    (NameError) and is called with an argument by createTree.
    Returns -1 when no feature yields a positive gain.
    """
    bestgain = 0.0
    bestfeature = -1
    baseEntropy = calcShannoEnt(dataset)
    numFeature = len(dataset[0]) - 1  # last column is the class label
    for i in range(numFeature):
        uniqueval = {example[i] for example in dataset}
        newEntropy = 0.0
        for value in uniqueval:
            subdataset = splitDataSet(dataset, i, value)
            di = len(subdataset) / float(len(dataset))  # weight |Di| / |D|
            newEntropy += di * calcShannoEnt(subdataset)
        gain = baseEntropy - newEntropy  # information gain of feature i
        if gain > bestgain:  # keep the feature maximizing gain(D, a)
            bestgain = gain
            bestfeature = i
    return bestfeature
#计算出现次数最多的类,作为该集合的类 majorityCnt
def majorityCnt(labels):
    """Return the most frequent class label in *labels* (majority vote)."""
    classlab = {}  # label -> occurrence count
    for lab in labels:
        classlab[lab] = classlab.get(lab, 0) + 1
    # fixes: don't shadow the builtin `sorted`; dict.iteritems() is
    # Python 2 only; `operator` was never imported — use a lambda key.
    ranked = sorted(classlab.items(), key=lambda kv: kv[1], reverse=True)
    return ranked[0][0]
#构建决策树 createTree
def createTree(dataset, labels):
    """Recursively build a decision tree as nested dicts.

    dataset: rows of feature values, last element is the class label.
    labels:  feature names aligned with the feature columns (mutated:
             the chosen feature's name is deleted at each level).
    Returns either a class label (leaf) or {feature_name: {value: subtree}}.
    """
    classlist = [example[-1] for example in dataset]
    if classlist.count(classlist[0]) == len(classlist):
        return classlist[0]  # all samples share one class: leaf, no split needed
    if len(dataset[0]) == 1:
        # no features left (only the label column remains): majority vote
        return majorityCnt(classlist)
    bestfeat = chooseBestFeatureToSplit(dataset)  # core of the algorithm
    bestfeatlabel = labels[bestfeat]
    myTree = {bestfeatlabel: {}}  # each split feature keys a sub-dict of branches
    del labels[bestfeat]  # a discrete feature is used at most once per path
    featvalues = [example[bestfeat] for example in dataset]
    for value in set(featvalues):
        # fixes: original wrote `label[:]` (NameError) and then passed the
        # mismatched name `sublabels` — copy so siblings see the same list
        sublabels = labels[:]
        myTree[bestfeatlabel][value] = createTree(
            splitDataSet(dataset, bestfeat, value), sublabels)  # nested subtree
    return myTree
####C45和ID3的区别
#找最优划分属性 chooseBestFeatureToSplit 以信息增益率做标准
def chooseBestFeatureToSplit(dataset):
    """Return the index of the feature with the largest gain ratio (C4.5).

    NOTE(review): this redefines the ID3 version above — in a single module
    only the last definition is in effect; consider distinct names.
    fixes: missing `dataset` parameter; log arguments were reversed
    (`log(2, di)` computes log base di of 2); guard IV == 0 which would
    otherwise divide by zero for a single-valued feature.
    """
    bestratio = 0.0
    bestfeature = -1
    baseEntropy = calcShannoEnt(dataset)
    numFeature = len(dataset[0]) - 1  # last column is the class label
    for i in range(numFeature):
        uniqueval = {example[i] for example in dataset}
        newEntropy = 0.0
        IV = 0.0  # intrinsic value (split information) — the C4.5 difference
        for value in uniqueval:
            subdataset = splitDataSet(dataset, i, value)
            di = len(subdataset) / float(len(dataset))
            newEntropy += di * calcShannoEnt(subdataset)
            IV -= di * log(di, 2)  # fix: was log(2, di)
        gain = baseEntropy - newEntropy
        if IV == 0.0:
            continue  # feature has a single value: gain ratio undefined, skip
        gain_ratio = gain / IV  # gain_ratio = gain / IV
        if gain_ratio > bestratio:
            bestratio = gain_ratio
            bestfeature = i
    return bestfeature
####CART用基尼指数作为标准
#计算Gini 基尼值
#提取划分后的子集 splitDataSet
#找最优划分属性 chooseBestFeatureToSplit
#计算出现次数最多的类,作为该集合的类 majorityCnt
#构建决策树 createTree
# 计算Gini 基尼值
from math import log
def Gini(dataset):
    """Return the Gini impurity of the class labels: 1 - sum(p_i ** 2).

    Each row's last element is taken as the class label.
    """
    labelCount = {}  # count n_i of each label y_i in the set
    for featvec in dataset:
        currentLabel = featvec[-1]
        # fix: original called dict.key(), which does not exist
        labelCount[currentLabel] = labelCount.get(currentLabel, 0) + 1
    numEntries = len(dataset)
    sumOfSquares = 0.0  # fix: `formal_Gini` was used before assignment
    for key in labelCount:
        prob = float(labelCount[key]) / numEntries
        sumOfSquares += prob ** 2  # fix: `^` is XOR in Python, not power
    return 1 - sumOfSquares
#提取划分后的子集 splitDataSet
def splitDataSet(dataset, axis, value):
    """Select the rows matching *value* at feature index *axis* and drop
    that column, yielding the child node's dataset.

    NOTE(review): duplicate of the splitDataSet defined earlier in the file.
    """
    retDataset = []  # fix: was {} — a dict has no .append, so this always crashed
    for featvec in dataset:
        if featvec[axis] == value:
            reducedfeatvec = featvec[:axis]
            reducedfeatvec.extend(featvec[axis + 1:])
            retDataset.append(reducedfeatvec)
    return retDataset
#找最优划分属性 chooseBestFeatureToSplit #基尼指数
def chooseBestFeatureToSplit(dataset):
    """Return the index of the feature with the smallest weighted Gini index (CART).

    NOTE(review): third redefinition of this name in the file; only the
    last definition is in effect.
    fixes: missing `dataset` parameter; `Gini_index` was never initialized
    (UnboundLocalError) — it must be reset to 0 for every feature, and the
    comparison belongs after the whole inner loop, not inside it.
    """
    minGini = 1  # the weighted Gini index is <= 1
    bestfeature = -1
    numFeature = len(dataset[0]) - 1  # last column is the class label
    for i in range(numFeature):
        uniqueval = {example[i] for example in dataset}
        Gini_index = 0.0  # accumulate sum(|Di|/|D| * Gini(Di)) for feature i
        for value in uniqueval:
            subdataset = splitDataSet(dataset, i, value)
            di = len(subdataset) / float(len(dataset))
            Gini_index += di * Gini(subdataset)
        if Gini_index < minGini:  # pick the feature minimizing the Gini index
            minGini = Gini_index
            bestfeature = i
    return bestfeature
# Tree construction (createTree) is the same as in ID3.