一、决策树
1.介绍
决策树是在已知各种情况发生概率的基础上,通过构成决策树来求取净现值的期望值大于等于零的概率,评价项目风险,判断其可行性的决策分析方法,是直观运用概率分析的一种图解法
主要算法有:ID3、C4.5、CART
2.决策事件处理流程
(1)构建策略
随着树的增加,结点的熵迅速降低。熵降低的速度越快越好,可以得到高度最矮的决策树
(2)决策树
决策树的基本流程遵循分而治之的策略
(3)伪代码
3.原理
对于一个分支结点,如果该结点所包含的样本都属于同一类,那么它的纯度为1,而纯度越高越好,尽可能多的样本属于同一类,样本集合D中第k类样本所占比例为pk,D的信息熵定义为
基于信息增益的最优属性划分原则——信息增益准则,对可取值数据较多的属性有所偏好
二、ID3决策树
(1)建立决策树
导入包
import pandas as pd
import numpy as np
from collections import Counter
from math import log2
获取数据
#数据获取与处理
def getData(filePath):
data = pd.read_excel(filePath)
return data
def dataDeal(data):
dataList = np.array(data).tolist()
dataSet = [element[1:] for element in dataList]
return dataSet
获取属性
#获取属性名称
def getLabels(data):
labels = list(data.columns)[1:-1]
return labels
获取类别
#获取类别标记
def targetClass(dataSet):
classification = set([element[-1] for element in dataSet])
return classification
叶节点
#将分支结点标记为叶结点,选择样本数最多的类作为类标记
def majorityRule(dataSet):
mostKind = Counter([element[-1] for element in dataSet]).most_common(1)
majorityKind = mostKind[0][0]
return majorityKind
计算信息熵
#计算信息熵
def infoEntropy(dataSet):
classColumnCnt = Counter([element[-1] for element in dataSet])
Ent = 0
for symbol in classColumnCnt:
p_k = classColumnCnt[symbol]/len(dataSet)
Ent = Ent-p_k*log2(p_k)
return Ent
子数据集构建
#子数据集构建
def makeAttributeData(dataSet,value,iColumn):
attributeData = []
for element in dataSet:
if element[iColumn]==value:
row = element[:iColumn]
row.extend(element[iColumn+1:])
attributeData.append(row)
return attributeData
计算信息增益
#计算信息增益
def infoGain(dataSet,iColumn):
Ent = infoEntropy(dataSet)
tempGain = 0.0
attribute = set([element[iColumn] for element in dataSet])
for value in attribute:
attributeData = makeAttributeData(dataSet,value,iColumn)
tempGain = tempGain+len(attributeData)/len(dataSet)*infoEntropy(attributeData)
Gain = Ent-tempGain
return Gain
选择最优属性
#选择最优属性
def selectOptimalAttribute(dataSet,labels):
bestGain = 0
sequence = 0
for iColumn in range(0,len(labels)):#不计最后的类别列
Gain = infoGain(dataSet,iColumn)
if Gain>bestGain:
bestGain = Gain
sequence = iColumn
print(labels[iColumn],Gain)
return sequence#
建立决策树
#建立决策树
def createTree(dataSet,labels):
classification = targetClass(dataSet) #获取类别种类(集合去重)
if len(classification) == 1:
return list(classification)[0]
if len(labels) == 1:
return majorityRule(dataSet)#返回样本种类较多的类别
sequence = selectOptimalAttribute(dataSet,labels)
print(labels)
optimalAttribute = labels[sequence]
del(labels[sequence])
myTree = {optimalAttribute:{}}
attribute = set([element[sequence] for element in dataSet])
for value in attribute:
print(myTree)
print(value)
subLabels = labels[:]
myTree[optimalAttribute][value] = \
createTree(makeAttributeData(dataSet,value,sequence),subLabels)
return myTree
主函数
def main():
filePath = 'watermalon.xls'
data = getData(filePath)
dataSet = dataDeal(data)
labels = getLabels(data)
myTree = createTree(dataSet,labels)
return myTree
生成
if __name__ == '__main__':
myTree = main()
结果
二、绘制决策树
#使用Matlotlib绘制决策树
import matplotlib.pyplot as plt
#设置文本框和箭头格式
decisionNode = dict(boxstyle = "sawtooth", fc = "0.8")
leafNode = dict(boxstyle = "round4", fc = "0.8")
arrow_args = dict(arrowstyle = "<-")
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.family'] = 'sans-serif'
#画节点
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
createPlot.ax1.annotate(nodeTxt, xy = parentPt,\
xycoords = "axes fraction", xytext = centerPt, textcoords = 'axes fraction',\
va = "center", ha = "center", bbox = nodeType, arrowprops = arrow_args)
#获取决策树的叶子节点数
def getNumLeafs(myTree):
leafNumber = 0
firstStr = list(myTree.keys())[0]
secondDict = myTree[firstStr]
for key in secondDict.keys():
if(type(secondDict[key]).__name__ == 'dict'):
leafNumber = leafNumber + getNumLeafs(secondDict[key])
else:
leafNumber += 1
return leafNumber
#获取决策树的高度(递归)
def getTreeDepth(myTree):
maxDepth = 0
firstStr = list(myTree.keys())[0]
secondDict = myTree[firstStr]
for key in secondDict.keys():
#test to see if the nodes are dictonaires, if not they are leaf nodes
if type(secondDict[key]).__name__=='dict':
thisDepth = 1 + getTreeDepth(secondDict[key])
else: thisDepth = 1
if thisDepth > maxDepth: maxDepth = thisDepth
return maxDepth
#在父子节点添加信息
def plotMidText(cntrPt, parentPt, txtString):
xMid = (parentPt[0]-cntrPt[0])/2.0 + cntrPt[0]
yMid = (parentPt[1]-cntrPt[1])/2.0 + cntrPt[1]
createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30)
#画树
def plotTree(myTree, parentPt, nodeTxt):#if the first key tells you what feat was split on
numLeafs = getNumLeafs(myTree) #this determines the x width of this tree
depth = getTreeDepth(myTree)
firstStr = list(myTree.keys())[0] #the text label for this node should be this
cntrPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW, plotTree.yOff)
plotMidText(cntrPt, parentPt, nodeTxt)
plotNode(firstStr, cntrPt, parentPt, decisionNode)
secondDict = myTree[firstStr]
plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD
for key in secondDict.keys():
if type(secondDict[key]).__name__=='dict':
plotTree(secondDict[key],cntrPt,str(key)) #recursion
else: #it's a leaf node print the leaf node
plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD
#画布初始化
def createPlot(inTree):
fig = plt.figure(1, facecolor='white')
fig.clf()
axprops = dict(xticks=[], yticks=[])
createPlot.ax1 = plt.subplot(111, frameon=False, **axprops) #no ticks
#createPlot.ax1 = plt.subplot(111, frameon=False) #ticks for demo puropses
plotTree.totalW = float(getNumLeafs(inTree))
plotTree.totalD = float(getTreeDepth(inTree))
plotTree.xOff = -0.5/plotTree.totalW; plotTree.yOff = 1.0;
plotTree(inTree, (0.5,1.0), '')
plt.show()
```cpp
在这里插入代码片def main():
#createPlot()
print(getTreeDepth(myTree)) #输出数的深度
print(getNumLeafs(myTree)) #输出数的叶子个数
createPlot(myTree)
main()
三、sk-learn实现决策树
#导入相关库
import pandas as pd
import graphviz
from sklearn.model_selection import train_test_split
from sklearn import tree
f = open('watermalon.csv','r',encoding='utf-8')
data = pd.read_csv(f)
x = data[["色泽","根蒂","敲声","纹理","脐部","触感"]].copy()
y = data['好瓜'].copy()
print(data)
数据转换
#将特征值数值化
x = x.copy()
for i in ["色泽","根蒂","敲声","纹理","脐部","触感"]:
for j in range(len(x)):
if(x[i][j] == "青绿" or x[i][j] == "蜷缩" or data[i][j] == "浊响" \
or x[i][j] == "清晰" or x[i][j] == "凹陷" or x[i][j] == "硬滑"):
x[i][j] = 1
elif(x[i][j] == "乌黑" or x[i][j] == "稍蜷" or data[i][j] == "沉闷" \
or x[i][j] == "稍糊" or x[i][j] == "稍凹" or x[i][j] == "软粘"):
x[i][j] = 2
else:
x[i][j] = 3
y = y.copy()
for i in range(len(y)):
if(y[i] == "是"):
y[i] = int(1)
else:
y[i] = int(-1)
#需要将数据x,y转化好格式,数据框dataframe,否则格式报错
x = pd.DataFrame(x).astype(int)
y = pd.DataFrame(y).astype(int)
print(x)
print(y)
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)
print(x_train)
绘制决策树,需要先配置graphviz
https://link.csdn.net/?target=https%3A%2F%2Fzhuanlan.zhihu.com%2Fp%2F268532582
# 加上Graphviz2.38绝对路径
import os
os.environ["PATH"] += os.pathsep + 'C:\Program Files\Graphviz\bin'
feature_name = ["色泽","根蒂","敲声","纹理","脐部","触感"]
dot_data = tree.export_graphviz(clf ,feature_names= feature_name,class_names=["好瓜","坏瓜"],filled=True,rounded=True,out_file =None)
graph = graphviz.Source(dot_data)
graph
四、cart算法
clf = tree.DecisionTreeClassifier(criterion="gini") #实例化
clf = clf.fit(x_train, y_train)
score = clf.score(x_test, y_test)
print(score)
# 加上Graphviz2.38绝对路径
import os
os.environ["PATH"] += os.pathsep + r'C:\Program Files\Graphviz\bin'
feature_name = ["色泽","根蒂","敲声","纹理","脐部","触感"]
dot_data = tree.export_graphviz(clf ,feature_names= feature_name,class_names=["好瓜","坏瓜"],filled=True,rounded=True,out_file =None)
graph = graphviz.Source(dot_data)
graph