1.ID3算法
我们要对这样一组数据来构建一棵决策树来预测在{天气=晴,温度=适中,湿度=正常,风速=弱}的情况下,是否适合活动。
信息熵为:
$$Ent(活动)=-\frac{9}{14}\log_{2}\frac{9}{14}-\frac{5}{14}\log_{2}\frac{5}{14}=0.9402859586706309$$
信息增益的计算:
$$Info_{天气}(活动)=\frac{5}{14}info_{晴}(活动)+\frac{4}{14}info_{阴}(活动)+\frac{5}{14}info_{雨}(活动)\\
=\frac{5}{14}\left(-\frac{3}{5}\log_{2}\frac{3}{5}-\frac{2}{5}\log_{2}\frac{2}{5}\right)+\frac{4}{14}\left(-\frac{4}{4}\log_{2}\frac{4}{4}\right)+\frac{5}{14}\left(-\frac{2}{5}\log_{2}\frac{2}{5}-\frac{3}{5}\log_{2}\frac{3}{5}\right)\\
=0.6935361388961918$$
(其中"阴"分支 4 个样本全为 yes,熵为 0;约定 $0\log_{2}0=0$。)
信息增益为:
$$Gain(天气)=Ent(活动)-Info_{天气}(活动)=0.9402859586706309-0.6935361388961918=0.2467498197744391$$
同理可以算出剩余三项的条件熵和信息增益分别为
| 特征 | $info_{A}(D)$(条件熵) | $Gain(A)$(信息增益) |
|---|---|---|
| 温度 | 0.9110633930116763 | 0.029222565658954647 |
| 湿度 | 0.7884504573082896 | 0.15183550136234136 |
| 风速 | 0.8921589282623617 | 0.04812703040826927 |
由此可以看出天气的信息增益最大,所以选择天气作为最优解,接下来数据集根据天气的不同被划分了三个部分:
紧接着对每个子数据集重复上面的步骤,构造出对应的子决策树。
from math import log
import operator
import pickle
# The classic 14-sample "play tennis" data set.
# Columns: outlook, temperature, humidity, wind, class label (play? yes/no).
dataSet = [['sunny','hot','high','weak','no'],
['sunny','hot','high','strong','no'],
['overcast','hot','high','weak','yes'],
['rain','mild','high','weak','yes'],
['rain','cool','normal','weak','yes'],
['rain','cool','normal','strong','no'],
['overcast','cool','normal','strong','yes'],
['sunny','mild','high','weak','no'],
['sunny','cool','normal','weak','yes'],
['rain','mild','normal','weak','yes'],
['sunny','mild','normal','strong','yes'],
['overcast','mild','high','strong','yes'],
['overcast','hot','normal','weak','yes'],
['rain','mild','high','strong','no']]
# Feature names, aligned with the first four columns of every row.
labels = ['outlook','temperature','humidity','wind']
# Demo: collect the distinct values taken by the first feature (outlook).
featlist=[number[0] for number in dataSet]
uniquelVals=set(featlist)
print(uniquelVals)
{'overcast', 'rain', 'sunny'}
def calcShanonEnt(dataset):
    """Shannon entropy of the class labels (last column) of `dataset`.

    dataset: list of sample rows; row[-1] is the class label.
    Returns the entropy in bits.
    """
    total = len(dataset)
    # Tally how many times each class label occurs.
    counts = {}
    for sample in dataset:
        label = sample[-1]
        counts[label] = counts.get(label, 0) + 1
    # Ent = -sum(p * log2 p) over the label distribution.
    entropy = 0.0
    for occurrences in counts.values():
        p = occurrences / float(total)
        entropy -= p * log(p, 2)
    return entropy
calcShanonEnt(dataSet)
0.9402859586706309
def splitDataSet(dataSet, axis, value):
    """Select the rows whose column `axis` equals `value`, dropping that column.

    Returns a new list of rows; each kept row omits the split feature so the
    recursion never re-splits on it.
    """
    return [row[:axis] + row[axis + 1:] for row in dataSet if row[axis] == value]
splitDataSet(dataSet,0,'rain')
[['mild', 'high', 'weak', 'yes'],
['cool', 'normal', 'weak', 'yes'],
['cool', 'normal', 'strong', 'no'],
['mild', 'normal', 'weak', 'yes'],
['mild', 'high', 'strong', 'no']]
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain (ID3).

    dataSet: rows whose last column is the class label.
    Returns -1 when no feature yields a positive gain.
    """
    featureCount = len(dataSet[0]) - 1        # the label column is not a feature
    baseEntropy = calcShanonEnt(dataSet)      # entropy before any split
    bestFeature = -1
    bestGain = 0
    for idx in range(featureCount):
        distinctValues = {row[idx] for row in dataSet}
        # Conditional entropy: weighted entropy of each value's subset.
        conditional = 0
        for val in distinctValues:
            subset = splitDataSet(dataSet, idx, val)
            weight = len(subset) / float(len(dataSet))
            conditional += weight * calcShanonEnt(subset)
        gain = baseEntropy - conditional
        if gain > bestGain:
            bestGain = gain
            bestFeature = idx
    return bestFeature
# Majority vote over class labels, used when features are exhausted.
def majorityCnt(classList):
    """Return the class label that occurs most often in `classList`."""
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # Rank label/count pairs by count, highest first; the winner leads.
    ranked = sorted(tally.items(), key=operator.itemgetter(1), reverse=True)
    return ranked[0][0]
def creatTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    dataSet: rows whose last column is the class label.
    labels: feature names aligned with the feature columns; NOTE: the chosen
            feature's name is deleted from this list (mutates the caller's list).
    Returns a class label (leaf) or {feature_name: {value: subtree}}.
    """
    classList = [row[-1] for row in dataSet]
    # Pure node: every sample carries the same class -> leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[-1]
    # Features exhausted (only the label column remains) -> majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLable = labels[bestFeat]
    myTree = {bestFeatLable: {}}
    del labels[bestFeat]          # this feature's label is now consumed
    # One branch per distinct value of the chosen feature.
    for value in {row[bestFeat] for row in dataSet}:
        # Recurse on the matching subset with a fresh copy of the labels.
        myTree[bestFeatLable][value] = creatTree(
            splitDataSet(dataSet, bestFeat, value), labels[:])
    return myTree
mytree=creatTree(dataSet,labels)
mytree
{'outlook': {'overcast': 'yes',
'rain': {'wind': {'strong': 'no', 'weak': 'yes'}},
'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}}}}
labels = ['outlook','temperature','humidity','wind']
featlabels=labels[:]
testFeatValue=['rain','hot','high','weak']
def classify(tree, featlabels, testFeatValue):
    """Classify one sample by walking the decision tree.

    tree: nested dict {feature_name: {feature_value: subtree_or_label}}.
    featlabels: feature names in the same order as testFeatValue.
    testFeatValue: the sample's feature values.
    Returns the predicted class label, or None when the sample's value for
    the split feature has no branch in the tree.

    Bug fix: the original never initialized classLabel, so an unseen feature
    value raised UnboundLocalError instead of returning a result.
    """
    firstStr = list(tree.keys())[0]            # feature tested at this node
    secondDict = tree[firstStr]                # branches keyed by feature value
    featIndex = featlabels.index(firstStr)     # column of that feature in the sample
    classLabel = None                          # default when no branch matches
    for firstStr_value in secondDict.keys():
        if testFeatValue[featIndex] == firstStr_value:
            if isinstance(secondDict[firstStr_value], dict):
                # Internal node: keep walking down the matching subtree.
                classLabel = classify(secondDict[firstStr_value], featlabels, testFeatValue)
            else:
                # Leaf node: the branch directly stores the class label.
                classLabel = secondDict[firstStr_value]
    return classLabel
# Bug fix: the original wrote `classify = classify(...)`, rebinding the name
# and clobbering the classify() function; store the result separately.
prediction = classify(mytree, featlabels, testFeatValue)
print(prediction)
yes
tinydict = {'Name': 'Zara', 'Age': 7}
tinydict.keys()
type(tinydict).__name__
'dict'
2.C4.5算法
C4.5是ID3算法的一个改进,它基于增益率来选择最优特征,而计算增益率需要在ID3的基础上再求解特征熵,所以C4.5仅需在ID3的求信息熵函数和选择最优特征的函数上做些修改即可。
import operator
from math import log
# The same 14-sample "play tennis" data set, reused for the C4.5 variant.
# Columns: outlook, temperature, humidity, wind, class label (play? yes/no).
dataSet = [['sunny','hot','high','weak','no'],
['sunny','hot','high','strong','no'],
['overcast','hot','high','weak','yes'],
['rain','mild','high','weak','yes'],
['rain','cool','normal','weak','yes'],
['rain','cool','normal','strong','no'],
['overcast','cool','normal','strong','yes'],
['sunny','mild','high','weak','no'],
['sunny','cool','normal','weak','yes'],
['rain','mild','normal','weak','yes'],
['sunny','mild','normal','strong','yes'],
['overcast','mild','high','strong','yes'],
['overcast','hot','normal','weak','yes'],
['rain','mild','high','strong','no']]
# Feature names, aligned with the first four columns of every row.
labels = ['outlook','temperature','humidity','wind']
# The class-label column (last element of every row).
classList=[example[-1] for example in dataSet]
# Demo: a row holds 4 feature values plus the class label (length 5).
print(dataSet[0])
len(dataSet[0])
['sunny', 'hot', 'high', 'weak', 'no']
5
# Generalized entropy helper: works for any column, not just the label column.
def clacShanonEntFeature(dataSet, i):
    """Shannon entropy of column `i` of `dataSet` (i=-1 gives the class entropy)."""
    n = len(dataSet)
    # Frequency of each distinct value in column i.
    freq = {}
    for row in dataSet:
        key = row[i]
        freq[key] = freq.get(key, 0) + 1
    # Ent = -sum(p * log2 p) over the column's value distribution.
    result = 0.0
    for count in freq.values():
        p = count / float(n)
        result -= p * log(p, 2)
    return result
clacShanonEntFeature(dataSet,0)
1.5774062828523452
def splitDataSet(dataSet, axis, value):
    """Rows of `dataSet` matching `value` at column `axis`, with that column dropped."""
    matched = []
    for row in dataSet:
        if row[axis] != value:
            continue
        # Keep everything except the split column.
        matched.append(row[:axis] + row[axis + 1:])
    return matched
splitDataSet(dataSet[:],0,'rain')
#分割不含最优特征的子数据集
[['mild', 'high', 'weak', 'yes'],
['cool', 'normal', 'weak', 'yes'],
['cool', 'normal', 'strong', 'no'],
['mild', 'normal', 'weak', 'yes'],
['mild', 'high', 'strong', 'no']]
def chooseBestFeature(Dataset):
    """Return the index of the feature with the highest gain ratio (C4.5).

    Dataset: rows whose last column is the class label.
    Returns -1 when no feature yields a positive gain ratio.

    Bug fix: the original read the module-level `dataSet` instead of the
    `Dataset` argument (in the base entropy, the feature lists, the subset
    split and the split info), so every recursive call from creatTree chose
    its split on the FULL data set rather than on the subset it was given —
    which produced the nonsensical tree in the original output.
    """
    baseEntropy = clacShanonEntFeature(Dataset, -1)   # entropy of the class column
    bestFeature = -1
    bestGrainrate = 0
    numbersFeature = len(Dataset[0]) - 1              # the label column is not a feature
    for i in range(numbersFeature):
        featurelist = [example[i] for example in Dataset]
        uniquefeature = set(featurelist)
        # Conditional entropy of the class given feature i.
        newEntropy = 0
        for uniquevalue in uniquefeature:
            subDataSet = splitDataSet(Dataset, i, uniquevalue)
            p = float(len(subDataSet)) / len(Dataset)
            newEntropy += p * clacShanonEntFeature(subDataSet, -1)
        infroGain = baseEntropy - newEntropy          # information gain
        splitInfo = clacShanonEntFeature(Dataset, i)  # intrinsic value of feature i
        if splitInfo == 0:
            # A single-valued feature cannot split the data; skip it.
            continue
        Grainrate = infroGain / splitInfo             # gain ratio
        if Grainrate > bestGrainrate:
            bestGrainrate = Grainrate
            bestFeature = i
    return bestFeature
#dataSet
chooseBestFeature(dataSet)
0
def majorityCnt(classList):
    """Return the most frequent class label in `classList`.

    Bug fix: the original only incremented inside an `else` branch, so each
    label's first occurrence was recorded as 0 and every count was one too
    low (the ID3 version of this helper counts correctly; this makes the
    two consistent).
    """
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1    # count every occurrence, including the first
    # Rank by count, highest first; the winner leads.
    sortedClasscount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClasscount[0][0]
def creatTree(dataSet, labels):
    """Recursively build a C4.5 decision tree as nested dicts.

    dataSet: rows whose last column is the class label.
    labels: feature names aligned with the feature columns; NOTE: the chosen
            feature's name is deleted from this list (mutates the caller's list).
    Returns a class label (leaf) or {feature_name: {value: subtree}}.
    """
    outcomes = [example[-1] for example in dataSet]
    # Pure node: every sample carries the same class -> leaf.
    if outcomes.count(outcomes[0]) == len(outcomes):
        return outcomes[0]
    # Features exhausted (only the label column remains) -> majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(outcomes)
    bestFeature = chooseBestFeature(dataSet)   # split on the best gain-ratio feature
    bestLabel = labels[bestFeature]
    mytree = {bestLabel: {}}
    del labels[bestFeature]                    # this feature's label is now consumed
    # One branch per distinct value of the chosen feature.
    for branchValue in set(example[bestFeature] for example in dataSet):
        # Recurse on the matching subset with a fresh copy of the labels.
        mytree[bestLabel][branchValue] = creatTree(
            splitDataSet(dataSet, bestFeature, branchValue), labels[:])
    return mytree
creatTree(dataSet,labels)
{'outlook': {'rain': {'wind': {'no': 'no', 'yes': 'yes'}},
'sunny': {'wind': {'no': 'no', 'yes': 'yes'}},
'overcast': 'yes'}}