题目
图4.2是一个递归算法,若面临巨量数据, 则决策树的层数会很深,使用递归方法易导致“栈”溢出。试使用“队列”数据结构,以参数MaxDepth控制树的最大深度,写出与图4.2等价、但不使用递归的决策树生成算法。
思考
我没写出严格的算法,是将对率回归决策树那一题的代码改了一下,实现不用递归,深度可控的决策树。不过那一题生成的树本来就比较矮,无法有更多的对照,之后我用别的数据集再做一下,先保留。
看到这题想到了之前牛客上看到的一道编程题,题目要求是“从上往下打印出二叉树的每个节点,同层节点从左至右打印” 这道题用的方法就是设置一个队列,用前序遍历,每读取一个节点,就将它的左右节点放到队列中,这样就可以保证按层访问二叉树的节点。所以这里是同样的思想,可以大致归纳为:
1.初始化一个队列,并将头结点放入队列中。
2.用一个while循环,当队列为空时停止。
3.让一个节点出队列作为当前节点。如果当前节点中的数据都为一类,则把该节点设置为叶子节点。或者数据集中只剩下类别标签(已无可用的划分特征),也把当前节点设置为叶子节点,投票选出最多的类作为该叶子节点的值。然后跳过下面的代码,继续得到下一个出队列的节点。
4.如果当前节点的深度小于MaxDepth,则继续划分。否则得到叶子节点。
主要代码
节点类
class TreeNode:
    """A node of the multivariate decision tree."""

    def __init__(self, dataSet, depth, beta):
        self.dataSet = dataSet    # data samples held by this node
        self.depth = depth        # depth of this node in the tree
        self.beta = beta          # logistic-regression parameters of this node
        self.classLabel = -1      # class value when this is a leaf node; -1 otherwise
        self.left = None          # left subtree
        self.right = None         # right subtree
生成树
关于python中队列的一些操作。
- from queue import Queue:queue为一个先入先出的队列
- q = Queue(maxsize=0) :初始化一个队列,maxsize表示队列的长度,当小于等于0时表示队列无限长。
- q.put(a): 元素a入队列
- q.get():返回队列首的元素,同时在队列中删除该元素
from queue import Queue
# Version that stores the weights at each node, which makes it easy to
# compute the accuracy afterwards.
def createTreeV3(dataSet, features, MaxDepth):
    """Iteratively build a multivariate (logistic-regression) decision tree.

    A FIFO queue replaces recursion, so the tree is grown level by level
    and its depth is capped by MaxDepth (no deep call stack on big data).

    :param dataSet: ndarray; last column is the class label, the other
                    columns are the bias-augmented features.
    :param features: feature names (kept for interface compatibility; not
                     used by this version).
    :param MaxDepth: maximum depth of the generated tree.
    :return: root TreeNode of the generated tree.
    """
    q = Queue(maxsize=0)  # maxsize <= 0 -> unbounded queue
    bestBeta = gradDescent(dataSet[:, :-1], dataSet[:, -1], 0.1, 500)
    # Enqueue the root node (depth 1); beta is passed via the constructor.
    headNode = TreeNode(dataSet, 1, bestBeta)
    q.put(headNode)
    while not q.empty():
        nowNode = q.get()
        classList = nowNode.dataSet[:, -1].tolist()
        # All samples in this node share one class -> make it a leaf.
        if classList.count(classList[0]) == len(classList):
            nowNode.beta = None  # leaves carry beta=None, which simplifies plotting
            nowNode.classLabel = classList[0]
            continue
        # BUG FIX: the original tested len(dataSet[0]) (the whole data set
        # captured from the enclosing scope) instead of the current node's data.
        if len(nowNode.dataSet[0]) == 1:
            nowNode.beta = None
            nowNode.classLabel = majorityCnt(classList)
            continue
        if nowNode.depth != MaxDepth:
            # Depth cap not reached: split on this node's beta and enqueue children.
            subDataIs, subDataNotIs = splitDataSet(nowNode.dataSet, nowNode.beta)
            betaL = gradDescent(subDataIs[:, :-1], subDataIs[:, -1], 0.1, 500)
            betaR = gradDescent(subDataNotIs[:, :-1], subDataNotIs[:, -1], 0.1, 500)
            NodeL = TreeNode(subDataIs, nowNode.depth + 1, betaL)
            NodeR = TreeNode(subDataNotIs, nowNode.depth + 1, betaR)
            nowNode.left = NodeL
            nowNode.right = NodeR
            q.put(NodeL)
            q.put(NodeR)
        else:
            # Depth cap reached: turn both halves into majority-vote leaves.
            nowDataSetL, nowDataSetR = splitDataSet(nowNode.dataSet, nowNode.beta)
            leafL = TreeNode(nowDataSetL, None, None)
            leafR = TreeNode(nowDataSetR, None, None)
            leafL.classLabel = majorityCnt(nowDataSetL[:, -1].tolist())
            leafR.classLabel = majorityCnt(nowDataSetR[:, -1].tolist())
            nowNode.left = leafL
            nowNode.right = leafR
    return headNode
测试
图像与精度
因为这里树只有2层那么深,所以只有两种例子
MaxDepth >= 2
MaxDepth = 1
完整代码
import numpy as np
import matplotlib.pyplot as plt
from pylab import *
import pandas as pd
import operator
from queue import Queue
# Feature dictionary: maps each discrete feature to its possible values.
# It is used in several places below, so it is kept as a module-level constant.
featureDic = {
    '色泽': ['浅白', '青绿', '乌黑'],
    '根蒂': ['硬挺', '蜷缩', '稍蜷'],
    '敲声': ['沉闷', '浊响', '清脆'],
    '纹理': ['清晰', '模糊', '稍糊'],
    '脐部': ['凹陷', '平坦', '稍凹'],
    '触感': ['硬滑', '软粘']}
# ***********************plotting***********************
# **********************start***********************
# See the decision-tree chapter of "Machine Learning in Action" for details.
# Text-box and arrow styles used when drawing the tree.
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")
mpl.rcParams['font.sans-serif'] = ['SimHei']  # without this, Chinese characters render as empty boxes
# mpl.rcParams['axes.unicode_minus'] = False  # fixes minus signs rendering as boxes when saving figures
def plotMidText(cntrPt, parentPt, txtString):
    """Draw txtString 2/5 of the way along the edge from child to parent."""
    mid_x = cntrPt[0] + 2 * (parentPt[0] - cntrPt[0]) / 5.0
    mid_y = cntrPt[1] + 2 * (parentPt[1] - cntrPt[1]) / 5.0
    createPlot.ax1.text(mid_x, mid_y, txtString, fontsize=20)
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw an annotated node box with an arrow from parentPt to centerPt."""
    annotation_kwargs = dict(
        xy=parentPt,
        xycoords="axes fraction",
        xytext=centerPt,
        textcoords="axes fraction",
        va="center",
        ha="center",
        bbox=nodeType,
        arrowprops=arrow_args,
        fontsize=12,
    )
    createPlot.ax1.annotate(nodeTxt, **annotation_kwargs)
def getNumLeafs(myTree):
    """Count the leaf nodes of the tree.

    Leaves are the nodes whose beta is None (set when the node was made a leaf).

    :param myTree: root TreeNode of the (sub)tree, or None.
    :return: number of leaves under myTree (0 for an empty subtree).
    """
    # BUG FIX: the None guard must come first; the original accessed
    # myTree.beta before testing `myTree is None`, which would raise
    # AttributeError when called on an empty subtree.
    if myTree is None:
        return 0
    if myTree.beta is None:
        return 1
    return getNumLeafs(myTree.left) + getNumLeafs(myTree.right)
def getTreeDepth(myTree):
    """Return the depth (number of levels) of the tree rooted at myTree."""
    if myTree is None:
        return 0
    # One level for the current node plus the deeper of the two subtrees.
    return 1 + max(getTreeDepth(myTree.left), getTreeDepth(myTree.right))
def plotTree(myTree, parentPt, nodeTxt, features):
    """Recursively draw the tree rooted at myTree.

    Layout state lives in function attributes (plotTree.totalW, plotTree.totalD,
    plotTree.xOff, plotTree.yOff), initialised by createPlot before the first call.

    :param myTree: current TreeNode to draw.
    :param parentPt: parent node coordinates (axes-fraction units).
    :param nodeTxt: edge label drawn between the parent and this node.
    :param features: feature names used to render the beta weights.
    """
    numLeafs = getNumLeafs(myTree)  # this determines the x width of this tree
    depth = getTreeDepth(myTree)
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.yOff)
    # beta is not None -> internal (decision) node.
    if myTree.beta is not None:
        plotMidText(cntrPt, parentPt, nodeTxt)
        # Node text: the signed weight of every feature, then the threshold.
        txt = ""
        for i in range(len(myTree.beta)):
            if i == 0:
                continue
            if myTree.beta[i] > 0:
                txt += "+" + str(myTree.beta[i][0]) + "x" + features[i - 1] + '\n'
            else:
                txt += str(myTree.beta[i][0]) + "x" + features[i - 1] + '\n'
        txt += "<=" + str(-myTree.beta[0][0])
        plotNode(txt, cntrPt, parentPt, decisionNode)
        plotTree.yOff = plotTree.yOff - 1 / plotTree.totalD
    else:
        # Leaf node: draw the predicted class text.
        plotTree.xOff = plotTree.xOff + 1 / plotTree.totalW
        plotMidText(cntrPt, parentPt, nodeTxt)
        leafTxt = ""
        if myTree.classLabel == 1:
            leafTxt = "好瓜"
        else:
            leafTxt = "坏瓜"
        plotNode(leafTxt, cntrPt, parentPt, decisionNode)
        plotTree.yOff = plotTree.yOff - 1 / plotTree.totalD
    # Recurse into the children; edge labels "是"/"否" mean yes/no.
    if myTree.left is not None:
        plotTree(myTree.left, cntrPt, "是", features)
    if myTree.right is not None:
        plotTree(myTree.right, cntrPt, "否", features)
    plotTree.yOff = plotTree.yOff + 1 / plotTree.totalD
def createPlot(inTree, features):
    """Prepare the canvas and render the decision tree rooted at inTree."""
    figure = plt.figure(1, figsize=(600, 30), facecolor='white')
    figure.clf()
    createPlot.ax1 = plt.subplot(111, frameon=False, xticks=[], yticks=[])
    # Shared layout state consumed by the recursive plotTree.
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 0.9
    plotTree(inTree, (0.5, 0.9), '', features)
    plt.show()
# ***********************画图***********************
# ***********************end************************
def getDataSet():
    """Return watermelon data set 3.0 alpha.

    Converting the mixed categorical/numeric rows with np.array yields a
    string ndarray; numeric columns are re-parsed downstream (see newData).

    :return: (feature matrix without the label column, label column, feature names)
    """
    dataSet = [
        ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.697, 0.460, 1],
        ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', 0.774, 0.376, 1],
        ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.634, 0.264, 1],
        ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', 0.608, 0.318, 1],
        ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.556, 0.215, 1],
        ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', 0.403, 0.237, 1],
        ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', 0.481, 0.149, 1],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', 0.437, 0.211, 1],
        ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', 0.666, 0.091, 0],
        ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', 0.243, 0.267, 0],
        ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', 0.245, 0.057, 0],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', 0.343, 0.099, 0],
        ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', 0.639, 0.161, 0],
        ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', 0.657, 0.198, 0],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', 0.360, 0.370, 0],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', 0.593, 0.042, 0],
        ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', 0.719, 0.103, 0]
    ]
    features = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感', '密度', '含糖量']
    # NOTE: the original also built a per-feature value-count list (numList)
    # and a commented-out featureDic generator here; neither was ever used,
    # so that dead code has been removed.
    dataSet = np.array(dataSet)
    return dataSet[:, :-1], dataSet[:, -1], features
# data, classLabel, feature = getDataSet()
# print(data)
# print(classLabel)
# print(feature)
def newData():
    """One-hot encode the categorical columns and assemble a numeric data set.

    :return: (numeric ndarray with a leading bias column of ones and the class
              label as the last column, list of the new feature names)
    """
    dataSet, classLabel, features = getDataSet()
    df = pd.DataFrame(dataSet, columns=features)
    # One-hot encode each categorical feature, preserving the original order.
    # features = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感', '密度', '含糖量']
    categorical = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感']
    encodedParts = [pd.get_dummies(df[name], prefix=name) for name in categorical]
    # The two continuous features are carried over unchanged.
    continuousPart = df[['密度', '含糖量']]
    merged = pd.concat(encodedParts + [continuousPart], axis=1)
    newFeatures = list(merged.columns)
    featureArr = np.asarray(merged, dtype="float64")
    labelCol = np.asarray(classLabel, dtype="int").reshape(-1, 1)
    # Append the class label as the last column.
    newDataSet = np.concatenate((featureArr, labelCol), axis=1)
    # Prepend a column of ones (the bias/intercept term for logistic regression).
    newDataSet = np.insert(newDataSet, 0,
                           np.ones(dataSet.shape[0]),
                           axis=1)
    return newDataSet, newFeatures
# data, feature = newData()
# print(data)
# print(feature)
# 对率回归
def sigmoid(Z):
    """Logistic function: map Z elementwise into the interval (0, 1)."""
    exp_neg = np.exp(-Z)
    return 1.0 / (1 + exp_neg)
# def newton(dataArr, labelArr):
# """
# calculate logistic parameters by newton method
#
# :param dataArr: input data set with shape (m, n)
# :param labelArr: the label of data set with shape (m, 1)
#
# :return: returns the parameters obtained by newton method
# """
# m, n = dataArr.shape
# labelArr = labelArr.reshape(-1, 1)
# beta = np.ones((n, 1))
# errList = [] # save error history
#
# z = np.dot(dataArr, beta)
# oldLbeta = 0
# # shape (m, 1)
# newLBetaMat = -labelArr * z + np.log(1 + sigmoid(z))
# newLBeta = np.sum(newLBetaMat)
# it = 0
# while abs(oldLbeta - newLBeta) > 1e-5:
# it += 1
# # py0 = p(y=0|x) with shape (m,1)
# py0 = sigmoid(-np.dot(dataArr, beta))
# py1 = 1 - py0
# # 'reshape(n)' get shape (n,); 'np.diag' get diagonal matrix with shape (m,m)
# p = np.diag((py0 * py1).reshape(m))
#
# # shape (m,n)
# dBetaMat = -dataArr * (labelArr - py1)
# # first derivative with shape (1, n)
# dBeta = np.sum(dBetaMat, axis=0, keepdims=True)
# # second derivative with shape (n, n)
# dBeta2 = dataArr.T.dot(p).dot(dataArr)
# dBeta2Inv = np.linalg.inv(dBeta2)
# # (n,1) (n,1) (n,n) (n,1)
# beta = beta - np.dot(dBeta2Inv, dBeta.T)
#
# z = np.dot(dataArr, beta)
# oldLbeta = newLBeta
# newLBetaMat = -labelArr * z + np.log(1 + sigmoid(z))
# newLBeta = np.sum(newLBetaMat)
#
# pre = predict(beta, dataArr)
# errorRate = cntErrRate(pre, labelArr)
# errList.append(errorRate)
# print("newton iteration is ", it)
# return beta, errList
def gradDescent(dataArr, labelArr, alpha, T):
    """Fit logistic-regression parameters by batch gradient descent.

    :param dataArr: input data with shape (m, n)
    :param labelArr: labels; reshaped internally to a column vector (m, 1)
    :param alpha: learning rate (step size)
    :param T: number of gradient-descent iterations
    :return: fitted parameter vector beta with shape (n, 1)
    """
    numFeatures = dataArr.shape[1]
    targets = labelArr.reshape(-1, 1)
    beta = np.ones((numFeatures, 1))
    for _ in range(T):
        # p(y = 1 | x) under the current parameters, shape (m, 1).
        py1 = sigmoid(np.dot(dataArr, beta))
        # Gradient of the negative log-likelihood summed over samples; shape (1, n).
        grad = np.sum(-dataArr * (targets - py1), axis=0, keepdims=True)
        beta -= alpha * grad.T
    return beta
def predict(beta, dataArr):
    """Return hard 0/1 predictions for dataArr under a logistic model beta."""
    probs = sigmoid(np.dot(dataArr, beta))
    # Samples strictly above 0.5 are class 1, everything else class 0.
    return np.where(probs > 0.5, 1.0, 0.0)
# def cntErrRate(preLabel, label):
# """
# calculate error rate
# :param preLabel: predict label
# :param label: real label
# :return: error rate
# """
# m = len(preLabel)
# cnt = 0.0
#
# for i in range(m):
# if preLabel[i] != label[i]:
# cnt += 1.0
# return cnt / float(m)
def majorityCnt(classList):
    """Return the most frequent class label in classList.

    Ties are broken in favour of the class encountered first, matching the
    stable-sort behaviour of the original implementation.

    :param classList: non-empty list of hashable class labels.
    :return: the majority class label.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # max() scans in insertion order, so the first label reaching the highest
    # count wins -- no need to sort the whole table just to take the top entry.
    return max(classCount, key=classCount.get)
def splitDataSet(dataSet, bestBeta):
    """Split dataSet in two by the sign of the linear score of each row.

    Rows whose feature columns dotted with bestBeta give a score <= 0 go
    into the first part, the remaining rows into the second.

    :param dataSet: ndarray whose last column is the class label.
    :param bestBeta: (n, 1) parameter vector for the feature columns.
    :return: (rows with score <= 0, rows with score > 0)
    """
    scores = np.dot(dataSet[:, :-1], bestBeta).flatten()
    negMask = scores <= 0
    return dataSet[negMask], dataSet[~negMask]
# Tree-node class used to store the per-node data and beta parameters.
class TreeNode:
    """A node of the multivariate (logistic-regression) decision tree."""

    def __init__(self, dataSet, depth, beta):
        self.dataSet = dataSet    # data samples held by this node
        self.depth = depth        # depth of this node in the tree
        self.beta = beta          # logistic-regression parameters; set to None for leaves
        self.classLabel = -1      # class value when this is a leaf node; -1 otherwise
        self.left = None          # left subtree (the score <= 0 split)
        self.right = None         # right subtree (the score > 0 split)
# Version that stores the weights at each node, which makes it easy to
# compute the accuracy afterwards.
def createTreeV3(dataSet, features, MaxDepth):
    """Iteratively build a multivariate (logistic-regression) decision tree.

    A FIFO queue replaces recursion, so the tree is grown level by level
    and its depth is capped by MaxDepth (no deep call stack on big data).

    :param dataSet: ndarray; last column is the class label, the other
                    columns are the bias-augmented features.
    :param features: feature names (kept for interface compatibility; not
                     used by this version).
    :param MaxDepth: maximum depth of the generated tree.
    :return: root TreeNode of the generated tree.
    """
    q = Queue(maxsize=0)  # maxsize <= 0 -> unbounded queue
    bestBeta = gradDescent(dataSet[:, :-1], dataSet[:, -1], 0.1, 500)
    # Enqueue the root node (depth 1); beta is passed via the constructor.
    headNode = TreeNode(dataSet, 1, bestBeta)
    q.put(headNode)
    while not q.empty():
        nowNode = q.get()
        classList = nowNode.dataSet[:, -1].tolist()
        # All samples in this node share one class -> make it a leaf.
        if classList.count(classList[0]) == len(classList):
            nowNode.beta = None  # leaves carry beta=None, which simplifies plotting
            nowNode.classLabel = classList[0]
            continue
        # BUG FIX: the original tested len(dataSet[0]) (the whole data set
        # captured from the enclosing scope) instead of the current node's data.
        if len(nowNode.dataSet[0]) == 1:
            nowNode.beta = None
            nowNode.classLabel = majorityCnt(classList)
            continue
        if nowNode.depth != MaxDepth:
            # Depth cap not reached: split on this node's beta and enqueue children.
            subDataIs, subDataNotIs = splitDataSet(nowNode.dataSet, nowNode.beta)
            betaL = gradDescent(subDataIs[:, :-1], subDataIs[:, -1], 0.1, 500)
            betaR = gradDescent(subDataNotIs[:, :-1], subDataNotIs[:, -1], 0.1, 500)
            NodeL = TreeNode(subDataIs, nowNode.depth + 1, betaL)
            NodeR = TreeNode(subDataNotIs, nowNode.depth + 1, betaR)
            nowNode.left = NodeL
            nowNode.right = NodeR
            q.put(NodeL)
            q.put(NodeR)
        else:
            # Depth cap reached: turn both halves into majority-vote leaves.
            nowDataSetL, nowDataSetR = splitDataSet(nowNode.dataSet, nowNode.beta)
            leafL = TreeNode(nowDataSetL, None, None)
            leafR = TreeNode(nowDataSetR, None, None)
            leafL.classLabel = majorityCnt(nowDataSetL[:, -1].tolist())
            leafR.classLabel = majorityCnt(nowDataSetR[:, -1].tolist())
            nowNode.left = leafL
            nowNode.right = leafR
    return headNode
# def createTreeV2(dataSet, features):
# classList = dataSet[:, -1].tolist()
# if classList.count(classList[0]) == len(classList):
# leaf = TreeNode()
# leaf.classLabel = classList[0]
# return leaf
# if len(dataSet[0]) == 1:
# leaf = TreeNode()
# leaf.classLabel = majorityCnt(classList)
# return leaf
#
# bestBeta = gradDescent(dataSet[:, :-1], dataSet[:, -1], 0.1, 500)
#
# Node = TreeNode()
# Node.beta = bestBeta
# subDataIs, subDataNotIs = splitDataSet(dataSet, bestBeta)
# Node.left = createTreeV2(subDataIs, features)
# Node.right = createTreeV2(subDataNotIs, features)
#
# return Node
def classify(data, Tree):
    """Predict the class of one sample with the multivariate decision tree.

    :param data: feature vector (including the bias term) of a single sample.
    :param Tree: root TreeNode of the decision tree.
    :return: the predicted class label.
    """
    node = Tree
    # Walk down the tree iteratively; leaves carry classLabel != -1.
    while node.classLabel == -1:
        if np.dot(data, node.beta) <= 0:
            node = node.left
        else:
            node = node.right
    return node.classLabel
def calcTreeAcc(dataSet, Tree):
    """Compute the classification accuracy of the tree on dataSet.

    :param dataSet: ndarray whose last column is the class label.
    :param Tree: root TreeNode of the decision tree.
    :return: fraction of correctly classified rows (0.0 .. 1.0).
    """
    n = dataSet.shape[0]
    rightCnt = 0.0
    for i in range(n):
        # BUG FIX: the original indexed dataSet[0] inside the loop, scoring
        # the first row n times instead of scoring each row once.
        if dataSet[i][-1] == classify(dataSet[i][:-1], Tree):
            rightCnt += 1
    return rightCnt / float(n)
def main():
    """Build a depth-limited multivariate tree on the watermelon data, report accuracy, plot it."""
    dataSet, features = newData()
    tree = createTreeV3(dataSet, features, 1)
    accuracy = calcTreeAcc(dataSet, tree)
    print("正确率 = ", accuracy)
    createPlot(tree, features)


if __name__ == '__main__':
    main()