Watermelon Book Exercise 4.5 (Decision Tree Split by Logistic Regression / Multivariate Decision Tree)

Problem

Write a program implementing a decision tree algorithm that uses logistic regression for split selection, and generate a decision tree for the data in Table 4.3.

Approach

1. First handle the discrete attributes: one-hot encode them into numeric values.
2. Fit a set of logistic regression parameters by gradient descent (Newton's method not tried yet).
3. Use these parameters to split the data into two classes.
4. Recurse until the decision tree's stopping conditions are met.

(This exercise was done in a hurry because I had a night train to catch; I did not compute the accuracy after drawing the figure, but the approach should be correct. Gradient descent yields a set of logistic regression parameters. Since the generated tree is binary, defining a binary-tree node class is convenient: each node's parameters can be stored in the object, which also makes it easy to compute the accuracy later. Pressed for time, I reused code from earlier exercises, which stores the tree in a dictionary. Because logistic regression needs numeric variables, the categorical variables are one-hot encoded here.)
Both versions are implemented below: a decision tree stored in a dictionary and one stored in node objects.
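
For reference, the logistic regression fitted at each node minimizes the log-likelihood objective of eq. (3.27) in the book, and the gradient used by step 2's gradient descent is its first derivative (eq. 3.30):

\ell(\beta) = \sum_{i=1}^{m}\Big(-y_i\,\beta^{\top}\hat{x}_i + \ln\big(1 + e^{\beta^{\top}\hat{x}_i}\big)\Big),
\qquad
\frac{\partial \ell}{\partial \beta} = -\sum_{i=1}^{m}\hat{x}_i\big(y_i - p_1(\hat{x}_i;\beta)\big)

where \hat{x} is the input extended with a constant component (prepended as a column of ones in the code below) and p_1(\hat{x};\beta) = 1/(1 + e^{-\beta^{\top}\hat{x}}) is the predicted probability of the positive class.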

Dictionary-based code

import operator

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd

# Feature dictionary; used in several places below, so it is kept as a global.
featureDic = {
    '色泽': ['浅白', '青绿', '乌黑'],
    '根蒂': ['硬挺', '蜷缩', '稍蜷'],
    '敲声': ['沉闷', '浊响', '清脆'],
    '纹理': ['清晰', '模糊', '稍糊'],
    '脐部': ['凹陷', '平坦', '稍凹'],
    '触感': ['硬滑', '软粘']}

# **********************Plotting********************
# **********************start***********************
# See the decision-tree chapter of "Machine Learning in Action" for details.

# text box and arrow formats
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")
mpl.rcParams['font.sans-serif'] = ['SimHei']  # without this, Chinese characters render as empty boxes


# mpl.rcParams['axes.unicode_minus'] = False  # fixes the minus sign '-' rendering as a box in saved figures


def plotMidText(cntrPt, parentPt, txtString):
    xMid = 2 * (parentPt[0] - cntrPt[0]) / 5.0 + cntrPt[0]
    yMid = 2 * (parentPt[1] - cntrPt[1]) / 5.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString, fontsize=20)


def plotNode(nodeTxt, centerPt, parentPt, nodeType):  # draw an annotation with an arrow
    createPlot.ax1.annotate(nodeTxt,
                            xy=parentPt,
                            xycoords="axes fraction",
                            xytext=centerPt,
                            textcoords="axes fraction",
                            va="center",
                            ha="center",
                            bbox=nodeType,
                            arrowprops=arrow_args,
                            fontsize=12)


def getNumLeafs(myTree):  # count the leaf nodes
    numLeafs = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            numLeafs += getNumLeafs(secondDict[key])
        else:
            numLeafs += 1
    return numLeafs


def getTreeDepth(myTree):  # get the depth of the tree
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            thisDepth = 1 + getTreeDepth(secondDict[key])
        else:
            thisDepth = 1
        if thisDepth > maxDepth: maxDepth = thisDepth
    return maxDepth


def plotTree(myTree, parentPt, nodeTxt):
    numLeafs = getNumLeafs(myTree)
    getTreeDepth(myTree)
    firstStr = list(myTree.keys())[0]
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW,
              plotTree.yOff)
    plotMidText(cntrPt, parentPt, nodeTxt)
    plotNode(firstStr, cntrPt, parentPt, decisionNode)
    secondDict = myTree[firstStr]
    plotTree.yOff = plotTree.yOff - 1.0 / plotTree.totalD
    for key in secondDict.keys():
        if type(secondDict[key]).__name__ == 'dict':
            plotTree(secondDict[key], cntrPt, str(key))
        else:
            plotTree.xOff = plotTree.xOff + 1.0 / plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff),
                     cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0 / plotTree.totalD


def createPlot(inTree):
    fig = plt.figure(1, figsize=(600, 30), facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 0.9
    plotTree(inTree, (0.5, 0.9), '')
    plt.show()


# **********************Plotting********************
# ***********************end************************


def getDataSet():
    """
    Get watermelon data set 3.0.
    :return: the feature matrix, the class labels, and the feature names.
    """
    dataSet = [
        ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.697, 0.460, 1],
        ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', 0.774, 0.376, 1],
        ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.634, 0.264, 1],
        ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', 0.608, 0.318, 1],
        ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.556, 0.215, 1],
        ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', 0.403, 0.237, 1],
        ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', 0.481, 0.149, 1],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', 0.437, 0.211, 1],
        ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', 0.666, 0.091, 0],
        ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', 0.243, 0.267, 0],
        ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', 0.245, 0.057, 0],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', 0.343, 0.099, 0],
        ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', 0.639, 0.161, 0],
        ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', 0.657, 0.198, 0],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', 0.360, 0.370, 0],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', 0.593, 0.042, 0],
        ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', 0.719, 0.103, 0]
    ]

    features = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感', '密度', '含糖量']
    # features = ['color', 'root', 'knocks', 'texture', 'navel', 'touch', 'density', 'sugar']

    # # The feature dictionary used to be generated here; a global is simpler.
    # featureDic = {}
    # for i in range(len(features)):
    #     featureList = [example[i] for example in dataSet]
    #     uniqueFeature = list(set(featureList))
    #     featureDic[features[i]] = uniqueFeature

    # number of values of each discrete feature (currently unused)
    numList = []  # [3, 3, 3, 3, 3, 2]
    for i in range(len(features) - 2):
        numList.append(len(featureDic[features[i]]))

    dataSet = np.array(dataSet)
    return dataSet[:, :-1], dataSet[:, -1], features


# data, classLabel, feature = getDataSet()
# print(data)
# print(classLabel)
# print(feature)


def newData():
    """
    One-hot encode the categorical variables with pandas so that all
    variables are numeric.
    :return: the all-numeric data set and the new feature names.
    """
    dataSet, classLabel, features = getDataSet()
    df = pd.DataFrame(dataSet)
    df.columns = features
    # one-hot encode each categorical variable
    color = pd.get_dummies(df.色泽, prefix="色泽")
    root = pd.get_dummies(df.根蒂, prefix="根蒂")
    knocks = pd.get_dummies(df.敲声, prefix="敲声")
    texture = pd.get_dummies(df.纹理, prefix="纹理")
    navel = pd.get_dummies(df.脐部, prefix="脐部")
    touch = pd.get_dummies(df.触感, prefix="触感")
    # density and sugar content are already numeric
    densityAndsugar = pd.DataFrame()
    densityAndsugar["密度"] = df.密度
    densityAndsugar["含糖量"] = df.含糖量
    # merge everything
    newData = pd.concat([color, root, knocks, texture, navel, touch, densityAndsugar], axis=1)
    newFeatures = list(newData.columns)
    newData = np.asarray(newData, dtype="float64")
    classLabel = np.asarray(classLabel, dtype="int").reshape(-1, 1)

    # append the class labels to the encoded features
    newDataSet = np.concatenate((newData, classLabel), axis=1)
    # prepend a bias column of ones (x0 = 1)
    newDataSet = np.insert(newDataSet, 0,
                           np.ones(dataSet.shape[0]),
                           axis=1)

    return newDataSet, newFeatures


# data, feature = newData()
# print(data)
# print(feature)
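
# With this encoding, newDataSet has shape (17, 21): a bias column of ones,
# 17 one-hot columns (3+3+3+3+3+2), density, sugar content, and the class label.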


# logistic (log-odds) regression
def sigmoid(Z):
    return 1.0 / (1 + np.exp(-Z))
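
# Note: np.exp(-Z) can overflow for very negative Z. If that ever matters,
# scipy.special.expit(Z) is a numerically stable drop-in replacement.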


def newton(dataArr, labelArr):
    """
    calculate logistic parameters by newton method

    :param dataArr: input data set with shape (m, n)
    :param labelArr: the label of data set with shape (m, 1)

    :return: returns the parameters obtained by newton method
    """
    m, n = dataArr.shape
    labelArr = labelArr.reshape(-1, 1)
    beta = np.ones((n, 1))
    errList = []  # save error history

    z = np.dot(dataArr, beta)
    oldLbeta = 0
    # per-sample negative log-likelihood (eq. 3.27), shape (m, 1);
    # np.logaddexp(0, z) computes ln(1 + e^z) without overflow
    newLBetaMat = -labelArr * z + np.logaddexp(0, z)
    newLBeta = np.sum(newLBetaMat)
    it = 0
    while abs(oldLbeta - newLBeta) > 1e-5:
        it += 1
        # py0 = p(y=0|x) with shape (m,1)
        py0 = sigmoid(-np.dot(dataArr, beta))
        py1 = 1 - py0
        # 'reshape(n)' get shape (n,); 'np.diag' get diagonal matrix with shape (m,m)
        p = np.diag((py0 * py1).reshape(m))

        # shape (m,n)
        dBetaMat = -dataArr * (labelArr - py1)
        # first derivative with shape (1, n)
        dBeta = np.sum(dBetaMat, axis=0, keepdims=True)
        # second derivative with shape (n, n)
        dBeta2 = dataArr.T.dot(p).dot(dataArr)
        dBeta2Inv = np.linalg.inv(dBeta2)
        # (n,1) (n,1)          (n,n)    (n,1)
        beta = beta - np.dot(dBeta2Inv, dBeta.T)

        z = np.dot(dataArr, beta)
        oldLbeta = newLBeta
        newLBetaMat = -labelArr * z + np.logaddexp(0, z)
        newLBeta = np.sum(newLBetaMat)

        pre = predict(beta, dataArr)
        errorRate = cntErrRate(pre, labelArr)
        errList.append(errorRate)
    print("newton iteration is ", it)
    return beta, errList
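
# Newton update used above (Watermelon Book eqs. 3.29-3.31):
#   beta <- beta - H^{-1} g
# where g = dl/dbeta    = -sum_i x_i (y_i - p1_i)           (shape (n, 1))
#   and H = d2l/dbeta^2 =  sum_i x_i x_i^T p1_i (1 - p1_i)  (shape (n, n)),
# with p1_i = sigmoid(beta^T x_i).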


def gradDescent(dataArr, labelArr, alpha, T):
    """
    calculate logistic parameters by gradient descent

    :param dataArr: input data set with shape (m, n)
    :param labelArr: the label of data set with shape (m, 1)
    :param alpha: step size (learning rate)
    :param T: number of iterations
    :return: parameters
    """
    m, n = dataArr.shape
    labelArr = labelArr.reshape(-1, 1)
    # errList = []

    beta = np.ones((n, 1))
    for t in range(T):
        # py1 = p(y=1|x) with shape (m, 1)
        py1 = sigmoid(np.dot(dataArr, beta))
        dBetaMat = -dataArr * (labelArr - py1)
        # shape (1,n)
        dBeta = np.sum(dBetaMat, axis=0, keepdims=True)
        beta -= alpha * dBeta.T

        # test code
        # pre = predict(beta, dataArr)
        # errorRate = cntErrRate(pre, labelArr)
        # errList.append(errorRate)

    return beta


# dataSet, features = newData()
# testBeta = gradDescent(dataSet[:, :-1], dataSet[:, -1], 0.1, 500)
# print(testBeta)


def predict(beta, dataArr):
    preArr = sigmoid(np.dot(dataArr, beta))
    preArr[preArr > 0.5] = 1
    preArr[preArr <= 0.5] = 0

    return preArr


def cntErrRate(preLabel, label):
    """
    calculate error rate
    :param preLabel: predict label
    :param label: real label
    :return: error rate
    """
    m = len(preLabel)
    cnt = 0.0

    for i in range(m):
        if preLabel[i] != label[i]:
            cnt += 1.0
    return cnt / float(m)


def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # classCount.items() turns the dict into (key, count) tuples, e.g. {'a': 1, 'b': 2} -> [('a', 1), ('b', 2)]
    # operator.itemgetter(1) sorts by the second element (the count)
    # reverse=True sorts in descending order: [('b', 2), ('a', 1)]
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]  # the label of the most frequent class
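
# Equivalent one-liner using the standard library:
#   from collections import Counter
#   return Counter(classList).most_common(1)[0][0]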


def splitDataSet(dataSet, bestBeta):
    """
    Split the data into two parts on the hyperplane beta^T x_hat = 0.
    :param dataSet: encoded data, bias column first and label last
    :param bestBeta: fitted logistic regression parameters
    :return: the two sub-datasets
    """
    dot = np.dot(dataSet[:, :-1], bestBeta).flatten()
    dataSet1 = dataSet[dot <= 0]
    dataSet2 = dataSet[dot > 0]
    return dataSet1, dataSet2
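
# The split boundary is beta^T x_hat = 0: sigmoid(z) > 0.5 iff z > 0, so
# dataSet1 holds the samples predicted 0 and dataSet2 the samples predicted 1,
# matching the 0.5 threshold used in predict().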


# A TreeNode-based version that stores beta at each node (which makes accuracy
# easy to compute) is implemented in full in the node-class code below.


def createTree(dataSet, features):
    classList = dataSet[:, -1].tolist()
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)

    bestBeta = gradDescent(dataSet[:, :-1], dataSet[:, -1], 0.1, 500)
    # build the text label for this node
    txt = ""
    for i in range(len(bestBeta)):
        if i == 0:
            continue
        if bestBeta[i] > 0:
            txt += "+" + str(bestBeta[i][0]) + "x" + features[i - 1] + '\n'
        else:
            txt += str(bestBeta[i][0]) + "x" + features[i - 1] + '\n'
    txt += "<=" + str(-bestBeta[0][0])

    subDataIs, subDataNotIs = splitDataSet(dataSet, bestBeta)
    # guard: if the fitted split leaves one side empty, the recursion would
    # never terminate, so fall back to the majority class
    if len(subDataIs) == 0 or len(subDataNotIs) == 0:
        return majorityCnt(classList)
    myTree = {txt: {}}
    myTree[txt]['是'] = createTree(subDataIs, features)
    myTree[txt]['否'] = createTree(subDataNotIs, features)

    return myTree


def main():
    dataSet, features = newData()
    myTree = createTree(dataSet, features)
    print(myTree)
    createPlot(myTree)


if __name__ == '__main__':
    main()

Figure

[Figure: decision tree generated by the dictionary-based code]

Node-class code

import operator

import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd

# Feature dictionary; used in several places below, so it is kept as a global.
featureDic = {
    '色泽': ['浅白', '青绿', '乌黑'],
    '根蒂': ['硬挺', '蜷缩', '稍蜷'],
    '敲声': ['沉闷', '浊响', '清脆'],
    '纹理': ['清晰', '模糊', '稍糊'],
    '脐部': ['凹陷', '平坦', '稍凹'],
    '触感': ['硬滑', '软粘']}

# **********************Plotting********************
# **********************start***********************
# See the decision-tree chapter of "Machine Learning in Action" for details.

# text box and arrow formats
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")
mpl.rcParams['font.sans-serif'] = ['SimHei']  # without this, Chinese characters render as empty boxes


# mpl.rcParams['axes.unicode_minus'] = False  # fixes the minus sign '-' rendering as a box in saved figures


def plotMidText(cntrPt, parentPt, txtString):
    xMid = 2 * (parentPt[0] - cntrPt[0]) / 5.0 + cntrPt[0]
    yMid = 2 * (parentPt[1] - cntrPt[1]) / 5.0 + cntrPt[1]
    createPlot.ax1.text(xMid, yMid, txtString, fontsize=20)


def plotNode(nodeTxt, centerPt, parentPt, nodeType):  # draw an annotation with an arrow
    createPlot.ax1.annotate(nodeTxt,
                            xy=parentPt,
                            xycoords="axes fraction",
                            xytext=centerPt,
                            textcoords="axes fraction",
                            va="center",
                            ha="center",
                            bbox=nodeType,
                            arrowprops=arrow_args,
                            fontsize=12)


def getNumLeafs(myTree):  # count the leaf nodes
    """
    Return the number of leaves in the tree.
    """
    if myTree is None:
        return 0
    # a node without beta is a leaf
    if myTree.beta is None:
        return 1
    return getNumLeafs(myTree.left) + getNumLeafs(myTree.right)


def getTreeDepth(myTree):  # get the depth of the tree
    """
    Return the depth of the tree.
    """
    if myTree is None:
        return 0

    # the 1 counts the current node
    depth = max(1 + getTreeDepth(myTree.left),
                1 + getTreeDepth(myTree.right))
    return depth


def plotTree(myTree, parentPt, nodeTxt, features):
    """
    Recursively draw the tree.
    :param myTree: tree node
    :param parentPt: parent node coordinates
    :param nodeTxt: edge label text
    :param features: feature names used to build node labels
    """
    numLeafs = getNumLeafs(myTree)  # this determines the x width of this subtree
    depth = getTreeDepth(myTree)
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs)) / 2.0 / plotTree.totalW, plotTree.yOff)

    # beta is not None -> internal (non-leaf) node
    if myTree.beta is not None:
        plotMidText(cntrPt, parentPt, nodeTxt)
        # node label text
        txt = ""
        for i in range(len(myTree.beta)):
            if i == 0:
                continue
            if myTree.beta[i] > 0:
                txt += "+" + str(myTree.beta[i][0]) + "x" + features[i - 1] + '\n'
            else:
                txt += str(myTree.beta[i][0]) + "x" + features[i - 1] + '\n'
        txt += "<=" + str(-myTree.beta[0][0])
        plotNode(txt, cntrPt, parentPt, decisionNode)
        plotTree.yOff = plotTree.yOff - 1 / plotTree.totalD
    else:
        plotTree.xOff = plotTree.xOff + 1 / plotTree.totalW
        plotMidText(cntrPt, parentPt, nodeTxt)
        leafTxt = ""
        if myTree.classLabel == 1:
            leafTxt = "好瓜"
        else:
            leafTxt = "坏瓜"
        plotNode(leafTxt, cntrPt, parentPt, leafNode)  # leaves use the leafNode box style
        plotTree.yOff = plotTree.yOff - 1 / plotTree.totalD

    if myTree.left is not None:
        plotTree(myTree.left, cntrPt, "是", features)
    if myTree.right is not None:
        plotTree(myTree.right, cntrPt, "否", features)

    plotTree.yOff = plotTree.yOff + 1 / plotTree.totalD


def createPlot(inTree, features):
    fig = plt.figure(1, figsize=(600, 30), facecolor='white')
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xOff = -0.5 / plotTree.totalW
    plotTree.yOff = 0.9
    plotTree(inTree, (0.5, 0.9), '', features)
    plt.show()


# **********************Plotting********************
# ***********************end************************


def getDataSet():
    """
    Get watermelon data set 3.0.
    :return: the feature matrix, the class labels, and the feature names.
    """
    dataSet = [
        ['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.697, 0.460, 1],
        ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', 0.774, 0.376, 1],
        ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.634, 0.264, 1],
        ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', 0.608, 0.318, 1],
        ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', 0.556, 0.215, 1],
        ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', 0.403, 0.237, 1],
        ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', 0.481, 0.149, 1],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', 0.437, 0.211, 1],
        ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', 0.666, 0.091, 0],
        ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', 0.243, 0.267, 0],
        ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', 0.245, 0.057, 0],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', 0.343, 0.099, 0],
        ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', 0.639, 0.161, 0],
        ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', 0.657, 0.198, 0],
        ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', 0.360, 0.370, 0],
        ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', 0.593, 0.042, 0],
        ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', 0.719, 0.103, 0]
    ]

    features = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感', '密度', '含糖量']
    # features = ['color', 'root', 'knocks', 'texture', 'navel', 'touch', 'density', 'sugar']

    # # The feature dictionary used to be generated here; a global is simpler.
    # featureDic = {}
    # for i in range(len(features)):
    #     featureList = [example[i] for example in dataSet]
    #     uniqueFeature = list(set(featureList))
    #     featureDic[features[i]] = uniqueFeature

    # number of values of each discrete feature (currently unused)
    numList = []  # [3, 3, 3, 3, 3, 2]
    for i in range(len(features) - 2):
        numList.append(len(featureDic[features[i]]))

    dataSet = np.array(dataSet)
    return dataSet[:, :-1], dataSet[:, -1], features


# data, classLabel, feature = getDataSet()
# print(data)
# print(classLabel)
# print(feature)


def newData():
    """
    One-hot encode the categorical variables with pandas so that all
    variables are numeric.
    :return: the all-numeric data set and the new feature names.
    """
    dataSet, classLabel, features = getDataSet()
    df = pd.DataFrame(dataSet)
    df.columns = features
    # one-hot encode each categorical variable
    color = pd.get_dummies(df.色泽, prefix="色泽")
    root = pd.get_dummies(df.根蒂, prefix="根蒂")
    knocks = pd.get_dummies(df.敲声, prefix="敲声")
    texture = pd.get_dummies(df.纹理, prefix="纹理")
    navel = pd.get_dummies(df.脐部, prefix="脐部")
    touch = pd.get_dummies(df.触感, prefix="触感")
    # density and sugar content are already numeric
    densityAndsugar = pd.DataFrame()
    densityAndsugar["密度"] = df.密度
    densityAndsugar["含糖量"] = df.含糖量
    # merge everything
    newData = pd.concat([color, root, knocks, texture, navel, touch, densityAndsugar], axis=1)
    newFeatures = list(newData.columns)
    newData = np.asarray(newData, dtype="float64")
    classLabel = np.asarray(classLabel, dtype="int").reshape(-1, 1)

    # append the class labels to the encoded features
    newDataSet = np.concatenate((newData, classLabel), axis=1)
    # prepend a bias column of ones (x0 = 1)
    newDataSet = np.insert(newDataSet, 0,
                           np.ones(dataSet.shape[0]),
                           axis=1)

    return newDataSet, newFeatures


# data, feature = newData()
# print(data)
# print(feature)


# logistic (log-odds) regression
def sigmoid(Z):
    return 1.0 / (1 + np.exp(-Z))


# def newton(dataArr, labelArr):
#     """
#     calculate logistic parameters by newton method
#
#     :param dataArr: input data set with shape (m, n)
#     :param labelArr: the label of data set with shape (m, 1)
#
#     :return: returns the parameters obtained by newton method
#     """
#     m, n = dataArr.shape
#     labelArr = labelArr.reshape(-1, 1)
#     beta = np.ones((n, 1))
#     errList = []  # save error history
#
#     z = np.dot(dataArr, beta)
#     oldLbeta = 0
#     # shape (m, 1)
#     newLBetaMat = -labelArr * z + np.logaddexp(0, z)
#     newLBeta = np.sum(newLBetaMat)
#     it = 0
#     while abs(oldLbeta - newLBeta) > 1e-5:
#         it += 1
#         # py0 = p(y=0|x) with shape (m,1)
#         py0 = sigmoid(-np.dot(dataArr, beta))
#         py1 = 1 - py0
#         # 'reshape(n)' get shape (n,); 'np.diag' get diagonal matrix with shape (m,m)
#         p = np.diag((py0 * py1).reshape(m))
#
#         # shape (m,n)
#         dBetaMat = -dataArr * (labelArr - py1)
#         # first derivative with shape (1, n)
#         dBeta = np.sum(dBetaMat, axis=0, keepdims=True)
#         # second derivative with shape (n, n)
#         dBeta2 = dataArr.T.dot(p).dot(dataArr)
#         dBeta2Inv = np.linalg.inv(dBeta2)
#         # (n,1) (n,1)          (n,n)    (n,1)
#         beta = beta - np.dot(dBeta2Inv, dBeta.T)
#
#         z = np.dot(dataArr, beta)
#         oldLbeta = newLBeta
#         newLBetaMat = -labelArr * z + np.logaddexp(0, z)
#         newLBeta = np.sum(newLBetaMat)
#
#         pre = predict(beta, dataArr)
#         errorRate = cntErrRate(pre, labelArr)
#         errList.append(errorRate)
#     print("newton iteration is ", it)
#     return beta, errList


def gradDescent(dataArr, labelArr, alpha, T):
    """
    calculate logistic parameters by gradient descent

    :param dataArr: input data set with shape (m, n)
    :param labelArr: the label of data set with shape (m, 1)
    :param alpha: step size (learning rate)
    :param T: number of iterations
    :return: parameters
    """
    m, n = dataArr.shape
    labelArr = labelArr.reshape(-1, 1)
    # errList = []

    beta = np.ones((n, 1))
    for t in range(T):
        # py1 = p(y=1|x) with shape (m, 1)
        py1 = sigmoid(np.dot(dataArr, beta))
        dBetaMat = -dataArr * (labelArr - py1)
        # shape (1,n)
        dBeta = np.sum(dBetaMat, axis=0, keepdims=True)
        beta -= alpha * dBeta.T

        # test code
        # pre = predict(beta, dataArr)
        # errorRate = cntErrRate(pre, labelArr)
        # errList.append(errorRate)

    return beta


def predict(beta, dataArr):
    preArr = sigmoid(np.dot(dataArr, beta))
    preArr[preArr > 0.5] = 1
    preArr[preArr <= 0.5] = 0

    return preArr


# def cntErrRate(preLabel, label):
#     """
#     calculate error rate
#     :param preLabel: predict label
#     :param label: real label
#     :return: error rate
#     """
#     m = len(preLabel)
#     cnt = 0.0
#
#     for i in range(m):
#         if preLabel[i] != label[i]:
#             cnt += 1.0
#     return cnt / float(m)


def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount:
            classCount[vote] = 0
        classCount[vote] += 1
    # classCount.items() turns the dict into (key, count) tuples, e.g. {'a': 1, 'b': 2} -> [('a', 1), ('b', 2)]
    # operator.itemgetter(1) sorts by the second element (the count)
    # reverse=True sorts in descending order: [('b', 2), ('a', 1)]
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
    return sortedClassCount[0][0]  # the label of the most frequent class


def splitDataSet(dataSet, bestBeta):
    """
    Split the data into two parts on the hyperplane beta^T x_hat = 0.
    :param dataSet: encoded data, bias column first and label last
    :param bestBeta: fitted logistic regression parameters
    :return: the two sub-datasets
    """
    dot = np.dot(dataSet[:, :-1], bestBeta).flatten()
    dataSet1 = dataSet[dot <= 0]
    dataSet2 = dataSet[dot > 0]
    return dataSet1, dataSet2


# Tree node; stores the split parameters (beta)
class TreeNode:
    def __init__(self):
        self.beta = None
        self.classLabel = -1
        self.left = None
        self.right = None
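
# Conventions: beta is None  -> leaf node (classLabel holds the 0/1 label);
#              classLabel -1 -> internal node (beta holds the split weights).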


# Node-class version: each node stores its weights, which makes it easy to compute accuracy.
def createTreeV2(dataSet, features):
    classList = dataSet[:, -1].tolist()
    if classList.count(classList[0]) == len(classList):
        leaf = TreeNode()
        leaf.classLabel = classList[0]
        return leaf
    if len(dataSet[0]) == 1:
        leaf = TreeNode()
        leaf.classLabel = majorityCnt(classList)
        return leaf

    bestBeta = gradDescent(dataSet[:, :-1], dataSet[:, -1], 0.1, 500)

    Node = TreeNode()
    Node.beta = bestBeta
    subDataIs, subDataNotIs = splitDataSet(dataSet, bestBeta)
    # guard: if the fitted split leaves one side empty, the recursion would
    # never terminate, so fall back to a majority-class leaf
    if len(subDataIs) == 0 or len(subDataNotIs) == 0:
        leaf = TreeNode()
        leaf.classLabel = majorityCnt(classList)
        return leaf
    Node.left = createTreeV2(subDataIs, features)
    Node.right = createTreeV2(subDataNotIs, features)

    return Node


def classify(data, Tree):
    """
    Predict the label of one sample with the multivariate decision tree.
    :param data: one encoded sample (bias column included, label excluded)
    :param Tree: the root TreeNode
    :return: the predicted class label
    """
    if Tree.classLabel != -1:
        return Tree.classLabel

    if np.dot(data, Tree.beta) <= 0:
        return classify(data, Tree.left)
    else:
        return classify(data, Tree.right)
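
# Example usage (hypothetical): with newDataSet returned by newData(),
#   classify(newDataSet[0][:-1], myTree)  # -> 0.0 or 1.0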


def calcTreeAcc(dataSet, Tree):
    """
    Compute the decision tree's classification accuracy on a data set.
    :param dataSet: encoded data with the label in the last column
    :param Tree: the root TreeNode
    :return: accuracy
    """
    n = dataSet.shape[0]
    rightCnt = 0.0
    for i in range(n):
        if dataSet[i][-1] == classify(dataSet[i][:-1], Tree):
            rightCnt += 1
    return rightCnt / float(n)
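
# A vectorized alternative with the same column-layout assumptions:
#   preds = np.array([classify(row[:-1], Tree) for row in dataSet])
#   return np.mean(preds == dataSet[:, -1])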


def main():
    # test createTreeV2(dataSet, features)
    dataSet, features = newData()
    myTree = createTreeV2(dataSet, features)
    print("多变量决策树的正确率 = ", calcTreeAcc(dataSet, myTree))
    createPlot(myTree, features)


if __name__ == '__main__':
    main()

Figure

[Figure: decision tree generated by the node-class code]

Accuracy

[Screenshot: accuracy printed for the training set]
