PYTHON机器学习实战——决策树ＤＴ

最新推荐文章于 2024-07-10 14:45:06 发布

EwenWanW

最新推荐文章于 2024-07-10 14:45:06 发布

阅读量965

点赞数

分类专栏： python学习文章标签：机器学习 python 决策树有监督学习

本文链接：https://blog.csdn.net/xiaoxiaowenqiang/article/details/76731447

版权

python学习专栏收录该内容

16 篇文章 1 订阅

订阅专栏

决策树也是有监督机器学习方法。

决策树算法是找到一个优化的决策路径（决策树），使得每次分类尽可能过滤更多的数据，或者说问的问题尽量少。
决策树算法可以用来优化一些知识系统，帮助用户快速找到答案。

基本概念

属性(Feature): 训练数据中每列都是一个属性。
标签(Label)：训练数据中的分类结果。

如何构造决策树

这里，要解决的问题是采用哪些数据属性作为分类条件，最佳次序是什么？

方法一：采用二分法，或者按照训练数据中的属性依次构造。
方法二：使用香农熵计算公式。这是书中使用的方法。
方法三：使用基尼不纯度2 （Gini impurity）。
流行的算法: C4.5和CART

代码详解


# -*- coding:utf-8 -*-
#!/usr/bin/python

# 测试 import DecTree as DT   DT.test_dt()

from math import log   # 对数
import operator        # 操作符

import copy            # 列表复制，不改变原来的列表


# 画树
import plot_deci_tree as pt

## 自定义数据集 来进行测试
def createDataSet():
    dataSet = [[1, 1, 'yes'],
                     [1, 1, 'yes'],
                     [1, 0, 'no'],
                     [0, 1, 'no'],
                     [0, 1, 'no']]
    labels = ['no surfacing','flippers'] # 属性值列表
    #change to discrete values
    return dataSet, labels


## 计算给定数据集的熵（混乱度） sum（概率 * （- log2(概率）））
def calcShannonEnt(dataSet):
    numEntries = len(dataSet) # 数据集 样本总数
    labelCounts = {}          # 标签集合 字典 标签 对应 出现的次数
    # 计算 类标签 各自出现的次数
    for featVec in dataSet:   # 每个样本
        currentLabel = featVec[-1]     # 每个样本的最后一列为标签
        # 若当前标签值不存在，则扩展字典，加入当前关键字
        if currentLabel not in labelCounts.keys(): labelCounts[currentLabel] = 0 
        labelCounts[currentLabel] += 1 # 对应标签 计数 + 1
    # 计算信息熵
    shannonEnt = 0.0   
    for key in labelCounts:
        prob = float(labelCounts[key])/numEntries  # 计算每类出现的概率
        shannonEnt -= prob * log(prob,2)           # 计算信息熵
    return shannonEnt
  
  
##  按照给定特征划分数据集 （取 划分属性值列 的剩余 部分）
def splitDataSet(dataSet, axis, value):
    # dataSet 数据集   axis划分的属性（哪一列）   对应属性（axis）值value 的那一行要去掉
    retDataSet = []         # 划分好的数据集
    for featVec in dataSet: # 每一行 即一个 样本
        if featVec[axis] == value:                  #  选取 符合 对应属性值 的列
            reducedFeatVec = featVec[:axis]         # 对应属性值 之前的其他属性值
            reducedFeatVec.extend(featVec[axis+1:]) # 加入 对应属性值 之前后的其他属性值  成一个 列表
            retDataSet.append(reducedFeatVec)       # 将其他部分 加入新 的列表里
    return retDataSet
 


## 选取最好的 划分属性值
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1       # 总特征维度， 最后一列是标签
    baseEntropy = calcShannonEnt(dataSet)   # 计算原来数据集的 信息熵
    bestInfoGain = 0.0; bestFeature = -1    # 信息增益 和 最优划分属性初始化
    
    for i in range(numFeatures):            # 对于所有的特征（每一列特征对应一个 属性，即对已每一个属性）
        featList = [example[i] for example in dataSet] # 列表推导 所有 对应 特征属性
        uniqueVals = set(featList)          # 从列表创建集合 得到每（列）个属性值的集合 用于划分集合
        newEntropy = 0.0
        for value in uniqueVals:            # 对于该属性  的每个 属性值
            subDataSet = splitDataSet(dataSet, i, value)    # 选取对应属性对应属性值 的新集合
            prob = len(subDataSet)/float(len(dataSet))      # 计算 该属性下该属性值的样本所占总样本数 的比例
            newEntropy += prob * calcShannonEnt(subDataSet) # 比例 * 对于子集合的信息熵，求和得到总信息熵  
        infoGain = baseEntropy - newEntropy                 # 原始集合信息熵 - 新划分子集信息熵和  得到信息增益
        if (infoGain > bestInfoGain):       # 信息熵 比划分前 减小了吗？ 减小的话 （信息增益增大）
            bestInfoGain = infoGain         # 更新最优 信息熵
            bestFeature = i                 # 记录当前最优 的划分属性
    return bestFeature                      # 返回全局最有的划分属性


# 统计样本集合 类出现的次数 返回出现最多的 分类名称
def majorityCnt(classList):
    classCount={}
    for vote in classList: # 每一个样本
        if vote not in classCount.keys(): classCount[vote] = 0 # 增加类标签到字典中
        classCount[vote] += 1                                  # 统计次数
    sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)# 按类出现次数 从大到小排序
    return sortedClassCount[0][0]                              # 返回出现次数最多的


# 输入数据集 和 属性标签 生成 决策树
def createTree(dataSet,labels):
    #copy_labels = labels
    classList = [example[-1] for example in dataSet]    # 每个样本的分类标签
    # 终止条件1 所有类标签完全相同
    if classList.count(classList[0]) == len(classList): 
        return classList[0]           # 返回该类标签（分类属性）
    # 终止条件2 遍历完所有特征
    if len(dataSet[0]) == 1: 
        return majorityCnt(classList) # 返回出现概率最大的类标签
    # 选择最好的划分属性（划分特征）
    bestFeat = chooseBestFeatureToSplit(dataSet)
    # 对于特征值（属性值）的特征（属性）
    bestFeatLabel = labels[bestFeat]
    # 初始化树
    myTree = {bestFeatLabel:{}} # 树的形状 分类属性：子树
    del(labels[bestFeat]) # 有问题 改变了 原来属性序列
    # 根据最优的划分属性 的 值列表 创建子树
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    for value in uniqueVals:          # 最优的划分属性 的 值列表
        subLabels = labels[:]         # 每个子树的 子属性标签
        # 递归调用 生成决策树
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),subLabels)
    return myTree                            


# 使用训练好的决策树做识别分类   
def classify(inputTree,featLabels,testVec):
    firstStr = inputTree.keys()[0]    # 起始分类属性  节点
    secondDict = inputTree[firstStr]  # 子树
    featIndex = featLabels.index(firstStr)  # 属性标签索引
   # key = testVec[featIndex]               # 测试属性值向量 起始分类属性 对应 的属性
   # valueOfFeat = secondDict[key]          # 对应的子树 或 叶子节点
   # if isinstance(valueOfFeat, dict):      # 子树还是 树
   #     classLabel = classify(valueOfFeat, featLabels, testVec) #递归调用
   # else: classLabel = valueOfFeat         # 将叶子节点的标签赋予 类标签 输出
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__ =='dict':      # 子树还是 树
                classLabel = classify(secondDict[key], featLabels, testVec) #递归调用
            else: classLabel = secondDict[key]         # 将叶子节点的标签赋予 类标签 输出
            
    return classLabel

# 使用 pickle 对象 存储决策数
def storeTree(inputTree,filename):
    import pickle
    fw = open(filename,'w')
    pickle.dump(inputTree,fw)
    fw.close()

# 使用 pickle 对象 载入决策树
def grabTree(filename):
    import pickle
    fr = open(filename)
    return pickle.load(fr)


# 使用佩戴眼镜数据测试

def test_dt():
    print '载入数据 lenses.txt ...'
    fr = open('lenses.txt')
    lenses = [inst.strip().split('\t') for inst in fr.readlines()]
    lenses_lab  = ['age','prescript','astigmatic','teatRate']
    print '创建 lenses  决策数...'
    lenses_tree = createTree(lenses,lenses_lab)
    print lenses_tree
    print '保存 lenses  决策数...'
    storeTree(lenses_tree,'lenses_tree.txt')
    print '可视化 lenses  决策数...'
    pt.createPlot(lenses_tree)

决策数可视化代码：

plot_deci_tree.py
# -*- coding:utf-8 -*-
#!/usr/bin/python
'''
绘制决策树生成的树类型字典
'''
import matplotlib.pyplot as plt


# 文本框类型 和 箭头类型
decisionNode = dict(boxstyle="sawtooth", fc="0.8") # 决策节点 文本框类型 花边矩形
leafNode = dict(boxstyle="round4", fc="0.8")       # 叶子节点 文本框类型 倒角矩形
arrow_args = dict(arrowstyle="<-")                 # 箭头类型

# 得到 叶子节点总树，用以确定 图的横轴长度
def getNumLeafs(myTree):
    # myTree = {bestFeatLabel:{}} # 树的形状 分类属性：子树  {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
    numLeafs = 0
    firstStr = myTree.keys()[0]   # 开始 节点 分类属性
    secondDict = myTree[firstStr] # 对应后面的子节点（子字典）
    for key in secondDict.keys():
        if type(secondDict[key]).__name__=='dict':   # 子节点 是字典 （孩子节点） 循环调用
            numLeafs += getNumLeafs(secondDict[key]) #
        else:   numLeafs +=1                         # 子节点的 如果不是 字典（孩子节点），就是叶子节点
    return numLeafs

# 得到 树的深度(层数)，用以确定 图的纵轴长度
def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = myTree.keys()[0]   # 开始 节点 分类属性
    secondDict = myTree[firstStr] # 对应后面的子节点（子字典）
    for key in secondDict.keys():
        if type(secondDict[key]).__name__=='dict':         # 子节点 是字典 （孩子节点） 循环调用
            thisDepth = 1 + getTreeDepth(secondDict[key])  # 
        else:   thisDepth = 1                              # 深度为1
        if thisDepth > maxDepth: maxDepth = thisDepth
    return maxDepth

# 绘制节点 带箭头的注释   框内文本  起点  终点   框类型
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt,  xycoords='axes fraction',
             xytext=centerPt, textcoords='axes fraction',
             va="center", ha="center", bbox=nodeType, arrowprops=arrow_args )


# 在 父子节点中间连线的线上 添加文本信息（属性分类值）
#                起点     终点      文本信息
def plotMidText(cntrPt, parentPt, txtString):
    # 计算中点位置（文本放置的位置）
    xMid = (parentPt[0]-cntrPt[0])/2.0 + cntrPt[0]
    yMid = (parentPt[1]-cntrPt[1])/2.0 + cntrPt[1]                            
    createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30) # 偏转角度 初始位置为90 度


# 画树
def plotTree(myTree, parentPt, nodeTxt):# if the first key tells you what feat was split on
    numLeafs = getNumLeafs(myTree)      # 树的宽度（图的横轴坐标，叶子节点的数量）
    depth = getTreeDepth(myTree)        # 树的高度（图的纵坐标，  树的层数）
    firstStr = myTree.keys()[0]         # 父节点，开始节点信息（分裂属性）
    cntrPt = (plotTree.xOff + (1.0 + float(numLeafs))/2.0/plotTree.totalW, plotTree.yOff)# 起点
    plotMidText(cntrPt, parentPt, nodeTxt)               # 线上注释
    plotNode(firstStr, cntrPt, parentPt, decisionNode)   # 画分类属性节点 和 箭头
    secondDict = myTree[firstStr]                        # 后面的子树，子字典
    plotTree.yOff = plotTree.yOff - 1.0/plotTree.totalD
    for key in secondDict.keys():
        if type(secondDict[key]).__name__=='dict':           # 子字典内还有字典
            plotTree(secondDict[key],cntrPt,str(key))        # 递归调用画子树
        else:                                                # 画叶子节点
            plotTree.xOff = plotTree.xOff + 1.0/plotTree.totalW
            plotNode(secondDict[key], (plotTree.xOff, plotTree.yOff), cntrPt, leafNode)
            plotMidText((plotTree.xOff, plotTree.yOff), cntrPt, str(key))
    plotTree.yOff = plotTree.yOff + 1.0/plotTree.totalD
#if you do get a dictonary you know it's a tree, and the first element will be another dict


# 画树
def createPlot(inTree):
    fig = plt.figure(1, facecolor='white') # 图1 背景白色
    fig.clf()                              # 清空显示
    axprops = dict(xticks=[], yticks=[])   # 标尺
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)    # no ticks
    #createPlot.ax1 = plt.subplot(111, frameon=False)              # ticks for demo puropses 
    plotTree.totalW = float(getNumLeafs(inTree))  # 叶子节点总数（以便确定图的横轴长度）
    plotTree.totalD = float(getTreeDepth(inTree)) # 树的深度（层树）（以便确定 纵轴长度）
    plotTree.xOff = -0.5/plotTree.totalW; plotTree.yOff = 1.0;
    plotTree(inTree, (0.5,1.0), '')
    plt.show()
    
# 带箭头的含有文本框的  图
def plot_tree_demo():
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    createPlot.ax1 = plt.subplot(111, frameon=False) #ticks for demo puropses 
    plotNode('a decision node', (0.5, 0.1), (0.1, 0.5), decisionNode)
    plotNode('a leaf node', (0.8, 0.1), (0.3, 0.8), leafNode)
    plt.show()

# 事先存储一个 树信息
def retrieveTree(i):
    listOfTrees =[{'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}},
                  {'no surfacing': {0: 'no', 1: {'flippers': {0: {'head': {0: 'no', 1: 'yes'}}, 1: 'no'}}}}
                  ]
    return listOfTrees[i]

#createPlot(thisTree)