'''
Decision Tree Source Code for Machine Learning in Action Ch. 3
'''

import operator
from collections import Counter
from math import log
def createDataSet():
    """Build the small toy data set used to exercise the decision-tree code.

    Each row holds two 0/1 feature values followed by a class label
    ('yes' or 'no') in the last position.

    Returns:
        A ``(dataSet, labels)`` tuple. ``dataSet`` is the list of rows and
        ``labels`` is the list of feature names, one per feature column.
        Fresh lists are created on every call.
    """
    dataSet = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    # Names of the two leading feature columns (their values are discrete).
    labels = ['no surfacing', 'flippers']
    return dataSet, labels
def calcShannonEnt(dataSet):
    """Compute the Shannon entropy (base 2) of the class labels in a data set.

    Args:
        dataSet: Sequence of feature vectors; the last element of each
            vector is treated as its class label.

    Returns:
        The entropy of the label distribution as a float. An empty data
        set yields 0.0 because there are no labels to sum over.
    """
    numEntries = len(dataSet)
    # Tally how many rows carry each class label (the final column).
    labelCounts = Counter(featVec[-1] for featVec in dataSet)
    shannonEnt = 0.0
    for count in labelCounts.values():
        # float() keeps this a true division even under Python 2 semantics.
        prob = float(count) / numEntries
        shannonEnt -= prob * log(prob, 2)  # log base 2
    return shannonEnt
def splitDataSet(dataSet, axis, value):
    """Select the rows whose feature ``axis`` equals ``value``.

    The matching rows are returned with the ``axis`` column removed. New
    row objects are built, so the input data set is not modified.

    Args:
        dataSet: Data set to split, as a sequence of feature vectors.
        axis: Index of the feature to split on. It is expected to be
            non-negative; the slicing used to drop the column does not
            handle negative indices.
        value: Value a row must have at ``axis`` to be kept.

    Returns:
        A list containing, for every row with ``row[axis] == value``, a
        copy of that row without the ``axis`` column, in input order.
    """
    return [
        # Everything before the split column plus everything after it.
        featVec[:axis] + featVec[axis + 1:]
        for featVec in dataSet
        if featVec[axis] == value
    ]
"""
func:选择最好的数据集划分方式
param:
dataset:待划分的数据集
return:
bestFeature:信息增益最大的特征,注意这里为该特征所在标签列表的索引;
"""defchooseBestFeatureToSplit(dataSet):
numFeatures =len(dataSet[0])-1#the last column is used for the labels# 计算经验熵 P(D)
baseEntropy = calcShannonEnt(dataSet)
bestInfoGain =0.0; bestFeature =-1for i inrange(numFeatures):#iterate over all the features# 创建唯一的分类标签列表
featList =[example[i]for example in dataSet]#create a list of all the examples of this feature
uniqueVals =set