Chapter 3: Decision Trees
This chapter covers what a decision tree is, the domains it suits, how a dataset is partitioned, and how a decision tree is built. What follows is a brief walkthrough of these ideas, together with code for constructing a decision tree.
Algorithm Overview
Information Entropy
Entropy is the measure of the information in a set: the expected value of the information. It is computed as

H(X) = -\sum_{i=1}^{n} p(x_i) \log_2 p(x_i)

where n is the number of classes and p(x_i) is the probability of class x_i.
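As a worked check, the toy dataset used in the code below has two "yes" and three "no" labels, so its entropy is

H = -\frac{2}{5}\log_2\frac{2}{5} - \frac{3}{5}\log_2\frac{3}{5} \approx 0.971

which is exactly the entropy value the test code prints.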
Conditional Entropy
The conditional entropy of Y given X is

H(Y|X) = \sum_{i=1}^{n} p_i \, H(Y \mid X = x_i)

where p_i = P(X = x_i) is the probability that X takes the value x_i.
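For example, splitting the same toy dataset on the first feature ("no surfacing"): three samples have the value 1 (two "yes", one "no") and two have the value 0 (both "no"), so

H(Y|X) = \frac{3}{5}\left(-\frac{2}{3}\log_2\frac{2}{3} - \frac{1}{3}\log_2\frac{1}{3}\right) + \frac{2}{5} \cdot 0 \approx 0.551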
Information Gain
Information gain is the drop in entropy produced by splitting the dataset D on a feature A:

g(D, A) = H(D) - H(D|A)

For a detailed introduction, see: https://zhuanlan.zhihu.com/p/41134986
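Continuing the example: the gain of "no surfacing" is 0.971 - 0.551 ≈ 0.420, while "flippers" gives H(D|A) = \frac{4}{5}\cdot 1 + \frac{1}{5}\cdot 0 = 0.8 and a gain of only ≈ 0.171, so the first feature is the better split. These are the two infogain values that chooseBestFeatureToSplit prints below.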
The ID3 Algorithm
For details, see: https://blog.csdn.net/weixin_43216017/article/details/87474045
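In brief, ID3 builds the tree top-down and greedily; the CreateTree function below implements exactly these steps: if every sample at a node has the same class, return that class as a leaf; if no features are left, return the majority class; otherwise split on the feature with the highest information gain and recurse on each resulting subset.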
Test Code
tree.py
from math import log
import pickle
from tree_show import *  # createPlot, from the visualization link at the end
class Tree():
    def __init__(self):
        pass

    def CreateDataSet(self):
        # toy dataset: two binary features plus a class label per row
        dataset = [
            [1, 1, "yes"],
            [1, 1, "yes"],
            [1, 0, "no"],
            [0, 1, "no"],
            [0, 1, "no"]
        ]
        feature_label = ["no surfacing", "flippers"]
        return dataset, feature_label
    def calcEntropy(self, dataset):
        # Shannon entropy of the class label: H = -sum(p * log2(p))
        allnums = len(dataset)
        labelcounts = {}
        for each in dataset:
            labelcounts[each[-1]] = labelcounts.get(each[-1], 0) + 1
        entropy = 0.0
        for key in labelcounts.keys():
            p = float(labelcounts[key]) / float(allnums)
            entropy += -p * log(p, 2)
        return entropy
    def splitDataSet(self, dataset, axis, val):
        # keep the rows where feature `axis` equals `val`, with that column removed
        retDataSet = []
        for each in dataset:
            if each[axis] == val:
                savefeature = each[:axis]
                savefeature.extend(each[axis+1:])
                retDataSet.append(savefeature)
        return retDataSet
    def chooseBestFeatureToSplit(self, dataset):
        featurenums = len(dataset[0]) - 1
        allnum = len(dataset)
        base_entropy = self.calcEntropy(dataset)
        bestinfogain = 0.0
        bestfeature = -1
        for i in range(featurenums):
            featurelist = [each[i] for each in dataset]
            featurelist = set(featurelist)
            # conditional entropy H(D|A) of this feature
            new_entropy = 0.0
            for val in featurelist:
                splitdataset = self.splitDataSet(dataset, i, val)
                pro = float(len(splitdataset)) / float(allnum)
                new_entropy += pro * self.calcEntropy(splitdataset)
            # information gain g(D, A) = H(D) - H(D|A)
            infogain = base_entropy - new_entropy
            print("feature:", i, ", infogain:", infogain)
            if infogain > bestinfogain:
                bestinfogain = infogain
                bestfeature = i
        return bestfeature
    def majorClass(self, classlist):
        # majority vote: return the most frequent class label
        class_dict = {}
        for cls in classlist:
            class_dict[cls] = class_dict.get(cls, 0) + 1
        clssorted = sorted(class_dict.items(), key=lambda item: item[1], reverse=True)
        return clssorted[0][0]
    def CreateTree(self, dataset, labels):
        classlist = [each[-1] for each in dataset]
        # all samples share one class: stop splitting
        if classlist.count(classlist[0]) == len(classlist):
            return classlist[0]
        # all features consumed (only the class label column remains):
        # return the majority class
        if len(dataset[0]) == 1:
            return self.majorClass(classlist)
        bestfeature = self.chooseBestFeatureToSplit(dataset)
        bestlabel = labels[bestfeature]
        mytree = {bestlabel: {}}
        del(labels[bestfeature])
        featurevals = [each[bestfeature] for each in dataset]
        featurevals = set(featurevals)
        for val in featurevals:
            sublabel = labels[:]
            mytree[bestlabel][val] = self.CreateTree(self.splitDataSet(dataset, bestfeature, val), sublabel)
        return mytree
    def TreeClassfiy(self, intree, featurelabel, testvec):
        # walk the tree recursively until a leaf (a non-dict value) is reached
        firstlabel = list(intree.keys())[0]
        seconddict = intree[firstlabel]
        index = featurelabel.index(firstlabel)
        retlabel = None  # stays None if the test value was never seen in training
        for key in seconddict.keys():
            if key == testvec[index]:
                if type(seconddict[key]).__name__ == "dict":
                    retlabel = self.TreeClassfiy(seconddict[key], featurelabel, testvec)
                else:
                    retlabel = seconddict[key]
        return retlabel
    def StoreTree(self, tree, savepath):
        # serialize the tree dict with pickle
        with open(savepath, "wb") as fp:
            pickle.dump(tree, fp)

    def LoadTree(self, treefile):
        with open(treefile, "rb") as fp:
            return pickle.load(fp)
if __name__ == '__main__':
    tree = Tree()
    # build the toy dataset
    dataset, label = tree.CreateDataSet()
    copylabel = label.copy()
    # entropy of the full dataset
    entropy = tree.calcEntropy(dataset)
    print("entropy: ", entropy)
    # split the dataset on feature 0 == 1
    splitdataset = tree.splitDataSet(dataset, 0, 1)
    print(splitdataset)
    # pick the best feature to split on
    feature = tree.chooseBestFeatureToSplit(dataset)
    print("bestfeature: ", feature)
    # build the decision tree recursively
    outtree = tree.CreateTree(dataset, copylabel)
    print(outtree)
    # visualize the tree
    createPlot(outtree)
    # use the tree to classify a new sample
    outcls = tree.TreeClassfiy(outtree, label, [1, 0])
    print("pre class: ", outcls)
    # persist the tree to disk
    tree.StoreTree(outtree, "treefile.txt")
    # and load it back
    loadtree = tree.LoadTree("treefile.txt")
    print("load tree:", loadtree)
The decision tree visualization code (the tree_show module imported above) can be found at: https://blog.csdn.net/wancongconghao/article/details/71171981
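If you just want tree.py to run without copying the plotting code from that link, the sketch below is a minimal stand-in for tree_show.py. It is an assumption of this note, not the linked implementation: it only renders the nested dict with plain matplotlib annotations, without the arrow styling of the original.

# tree_show.py -- minimal stand-in, NOT the implementation from the linked article
import matplotlib.pyplot as plt

def _plot_node(ax, tree, x0, x1, y, dy):
    # tree is either a leaf label or {feature: {value: subtree, ...}}
    xmid = (x0 + x1) / 2.0
    if not isinstance(tree, dict):
        # leaf: just draw the class label
        ax.text(xmid, y, str(tree), ha="center",
                bbox=dict(boxstyle="round", fc="lightyellow"))
        return
    feature = list(tree.keys())[0]
    ax.text(xmid, y, feature, ha="center",
            bbox=dict(boxstyle="round", fc="lightgray"))
    branches = tree[feature]
    width = (x1 - x0) / len(branches)
    for i, (val, subtree) in enumerate(branches.items()):
        cx0, cx1 = x0 + i * width, x0 + (i + 1) * width
        cxmid = (cx0 + cx1) / 2.0
        # edge down to the child, labeled with the feature value
        ax.plot([xmid, cxmid], [y, y - dy], "k-")
        ax.text((xmid + cxmid) / 2.0, y - dy / 2.0, str(val), ha="center")
        _plot_node(ax, subtree, cx0, cx1, y - dy, dy)

def createPlot(tree):
    fig, ax = plt.subplots()
    ax.axis("off")
    _plot_node(ax, tree, 0.0, 1.0, 1.0, 0.4)
    plt.show()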