This section walks through the decision tree code itself; the comments alongside the code explain each step. You can copy it and run it directly. Tree visualization and a worked example will be posted separately.
from math import log
import operator

# Compute the Shannon entropy of a dataset
def calcshannonent(dataset):
    numentries = len(dataset)
    labelcounts = {}
    for featvec in dataset:
        currentlabel = featvec[-1]  # the class label is the last element of each row
        if currentlabel not in labelcounts:
            labelcounts[currentlabel] = 0
        labelcounts[currentlabel] += 1
    shannonent = 0.0
    for key in labelcounts:
        prob = float(labelcounts[key]) / numentries  # frequency of this class
        shannonent -= prob * log(prob, 2)
    return shannonent
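calcshannonent implements the usual formula H = -sum_k p_k * log2(p_k) over the class frequencies. As a sanity check, the sample dataset defined below has two 'yes' rows and three 'no' rows, so working the formula by hand gives the value the function should return:
# -(2/5)*log(2/5, 2) - (3/5)*log(3/5, 2)
# 0.9709505944546686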
# A simple test dataset
def createdataset():
    dataset = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flipper']
    return dataset, labels
# mydat, labels = createdataset()
# calcshannonent(mydat)
# 0.9709505944546686
# The higher the entropy, the more mixed the data. Adding a third class raises it:
# mydat[0][-1] = 'maybe'
# mydat
# [[1, 1, 'maybe'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']]
# calcshannonent(mydat)
# 1.3709505944546687
# Split the dataset on a given feature
def splitdataset(dataset, axis, value):
    # dataset: the dataset; axis: index of the feature to split on;
    # value: the feature value the returned rows must match
    retdataset = []
    for featvec in dataset:
        if featvec[axis] == value:
            # Copy the row with the splitting feature removed
            reducefeatvec = featvec[:axis]
            reducefeatvec.extend(featvec[axis+1:])
            retdataset.append(reducefeatvec)
    return retdataset
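For example, splitting the sample data on feature 0 keeps only the matching rows and removes that column:
# mydat, labels = createdataset()
# splitdataset(mydat, 0, 1)
# [[1, 'yes'], [1, 'yes'], [0, 'no']]
# splitdataset(mydat, 0, 0)
# [[1, 'no'], [1, 'no']]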
# Choose the best feature to split on (highest information gain)
def choosebestfeaturetosplit(dataset):
    numfeatures = len(dataset[0]) - 1  # the last column is the class label
    baseentropy = calcshannonent(dataset)
    bestinfogain = 0.0
    bestfeature = -1
    for i in range(numfeatures):
        featlist = [example[i] for example in dataset]  # all values of feature i
        uniquevals = set(featlist)  # the unique values of this feature
        newentropy = 0.0
        for value in uniquevals:
            subdataset = splitdataset(dataset, i, value)
            prob = len(subdataset) / float(len(dataset))
            newentropy += prob * calcshannonent(subdataset)  # weighted entropy after the split
        infogain = baseentropy - newentropy
        if infogain > bestinfogain:
            bestinfogain = infogain
            bestfeature = i
    return bestfeature
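On the sample data this chooses feature 0 ('no surfacing'): splitting on it leaves a weighted entropy of (3/5)*0.918 + (2/5)*0 ≈ 0.551, a gain of about 0.420, while splitting on 'flipper' leaves (4/5)*1.0 + (1/5)*0 = 0.8, a gain of only about 0.171:
# mydat, labels = createdataset()
# choosebestfeaturetosplit(mydat)
# 0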
# When all features are used up but the class labels are still mixed,
# decide the leaf's class by majority vote
def majoritycnt(classlist):
    classcount = {}
    for vote in classlist:  # count how often each class appears, much like classify0() in the kNN chapter
        if vote not in classcount:
            classcount[vote] = 0
        classcount[vote] += 1
    sortedclasscount = sorted(classcount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedclasscount[0][0]  # the most frequent class
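A quick illustration with a made-up label list:
# majoritycnt(['yes', 'no', 'no'])
# 'no'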
# Build the tree
def createtree(dataset, labels):
    classlist = [example[-1] for example in dataset]  # the class labels of every row
    if classlist.count(classlist[0]) == len(classlist):  # all labels identical: return that class
        return classlist[0]
    if len(dataset[0]) == 1:  # no features left but labels still mixed: majority vote
        return majoritycnt(classlist)
    bestfeat = choosebestfeaturetosplit(dataset)  # index of the best feature to split on
    bestfeatlabel = labels[bestfeat]  # its name
    mytree = {bestfeatlabel: {}}  # the tree is stored as nested dicts
    # Note on del: it acts on the variable (the name binding), not on the data
    # object itself
    del(labels[bestfeat])
    featvalues = [example[bestfeat] for example in dataset]
    uniquevals = set(featvalues)
    # The two lines above collect every value the best feature takes
    for value in uniquevals:
        # Lists are passed by reference in Python, so copy the labels with
        # sublabels = labels[:] to keep each recursive createtree() call from
        # mutating the caller's list
        sublabels = labels[:]
        mytree[bestfeatlabel][value] = createtree(splitdataset(dataset, bestfeat, value), sublabels)
    return mytree
# Quick test of the main code
mydat, labels = createdataset()
mytree = createtree(mydat, labels)
print(mytree)
# {'no surfacing': {0: 'no', 1: {'flipper': {0: 'no', 1: 'yes'}}}}