划分依据
决策树的主要依据为信息熵计算,信息熵最大的最为分类依据流程
创建数据集 –> 计算信息熵,最大值作为结点,划分子数据集 –> 递归寻找代码
from math import log
import operator
'''
机器学习实战-第三章(决策树)
'''
# 创建数据集
def createDataSet():
dataset = [
[1, 1, 'yes'],
[1, 0, 'no'],
[0, 1, 'no'],
[0, 1, 'no']
]
labels = ['good', 'bad']
return dataset, labels
# 计算香农熵
def calcShannonEnt(dataset):
numEntries = len(dataset)
labelsCount = {} # 字典相当于java中的map
for featVec in dataset:
currentLabel = featVec[-1]
if currentLabel not in labelsCount.keys():
labelsCount[currentLabel] = 1
else:
labelsCount[currentLabel] += 1
shannonEnt = 0.0
for key in labelsCount:
prop = labelsCount[key] / numEntries
shannonEnt -= prop * log(prop, 2)
return shannonEnt
# 划分数据集
# 筛选出第axis个特征的值为value的项,同时删除次特征列
def splitDataSet(dataSet, axis, value):
retDataSet = []
for featureVec in dataSet:
if featureVec[axis] == value:
reducedFeatVec = featureVec[:axis]
reducedFeatVec.extend(featureVec[axis + 1:])
retDataSet.append(reducedFeatVec)
return retDataSet
# 补充:python中,可变的为引用,需要创建副本如列表;不可变的为值传递,如元组
# append和extend的区别,append是将后面一个作为整体一个加入,extend是将后面一个拆开,和之前的元组类型一样的
# 计算每一个特征值所对应的信息熵,选出最大的信息熵
def chooseBestFeatureToSplit(dataSet):
numFeature = len(dataSet[0]) - 1 # 特征数
bestInfoGain = 0.0;
bestFeatureIndex = -1; # 最大的信息增益和所在的特征列,下标
# 分别对每一列特征进行熵计算(i)
for i in range(numFeature):
featureList = [feature[0] for feature in dataSet]
featureSet = set(featureList) # 将list转化为set集合,提取出每一列的特征项(不重复)
for value in featureSet:
subDataSet = splitDataSet(dataSet, i, value)
prop = len(subDataSet) / float(len(dataSet)) # 百分比
infoGain = 0.0 - prop * calcShannonEnt(subDataSet)
if infoGain > bestFeatureIndex:
bestFeatureIndex = infoGain
bestFeatureIndex = i
return bestFeatureIndex
# 构建决策树
def createTree(dataSet, labels):
classList = [oneData[-1] for oneData in dataSet]
# 类别全部相同,就不用分(即label都相同)
if classList.count(classList[0]) == len(dataSet):
return classList[0]
# 由于可能存在没有属性的情况,最后还有几个不能分,此时,可以考虑将数量多的作为最终的结果。
if len(dataSet[0]) == 1:
return majorityCnt(classList)
bestFeatureIndex = chooseBestFeatureToSplit(dataSet)
bestFeatureLabel = labels[bestFeatureIndex]
myTree = {bestFeatureLabel: {}} # 通过字典来构建决策树
featureList = [feature[0] for feature in dataSet]
featureSet = set(featureList) # 将list转化为set集合,提取出每一列的特征项(不重复)
del(labels[bestFeatureIndex])
for value in featureSet:
sublabels = labels[:]
myTree[bestFeatureLabel][value] = createTree(splitDataSet
(dataSet, bestFeatureIndex, value), sublabels)
return myTree
# 找出最多的项
def majorityCnt(classList):
countList = {}
for oneData in classList:
if oneData not in countList.keys():
countList[oneData] = 0
countList[oneData] += 1
# 从大到小排序,并返回最大值
sortedList = sorted(countList.iteritems(),key=operator.itemgetter(1),reverse=True)
return sortedList[0][0]
dataSet,labels = createDataSet()
myTree = createTree(dataSet,labels)
print(myTree)
欢迎使用 {小书匠}(xiaoshujiang)编辑器,您可以通过==设置==里的修改模板来改变新建文章的内容。