机器学习实战之决策树(不带剪枝)分类算法分享交流

最新推荐文章于 2022-03-09 12:04:09 发布

柚子咩

最新推荐文章于 2022-03-09 12:04:09 发布

阅读量453

点赞数

分类专栏： Python 文章标签：决策树

本文链接：https://blog.csdn.net/qq_34454366/article/details/104532323

版权

这篇博客分享了使用决策树进行预测的实践，针对《机器学习实战》中的决策树例子进行了代码调整，以适应Python新版本。作者已测试过预测隐形眼镜类型的代码，运行正常，并表示后续会继续更新内容，欢迎大家在评论区提问交流。

摘要由CSDN通过智能技术生成

本文整理并分析了书中代码的一些错误和不适用之处（主要由Python版本更新引起），并已在代码中修正。目前只包含基础部分的代码，后面例子的代码待调试分析后再上传，供大家学习交流。

# -*- coding:utf-8 -*-
from math import log
import matplotlib.pyplot as plt
import operator
import pickle

# Box and arrow style dictionaries (keys use matplotlib's boxstyle/arrowstyle
# names). The code that consumes them is not part of this section.
decisionNode = {"boxstyle": "sawtooth", "fc": "0.8"}  # decision node: sawtooth-edged text box
leafNode = {"boxstyle": "round4", "fc": "0.8"}  # leaf node: rounded four-sided text box
arrow_args = {"arrowstyle": "<-"}  # arrow style


def createDataSet():
    """Return a small sample data set and the names of its feature columns.

    Every row has the form ``[feature0, feature1, class_label]``; the
    returned label list names the two feature columns in order.
    A fresh pair of lists is built on every call.
    """
    featureNames = ['no surfacing', 'flippers']
    samples = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    return samples, featureNames

def calcShannonEnt(dataSet):
    """Compute the Shannon entropy (base 2) of the class labels in a data set.

    :param dataSet: sequence of rows; the last item of each row is its class label
    :return: the entropy as a float (0.0 for an empty data set)
    """
    total = len(dataSet)
    # Tally how many rows carry each class label.
    tally = {}
    for row in dataSet:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1
    # Subtract p * log2(p) for every class, where p is its relative frequency.
    entropy = 0.0
    for count in tally.values():
        p = float(count) / total
        entropy -= p * log(p, 2)
    return entropy

def splitDataSet(dataSet,axis,value):
    '''
    Select the rows whose feature at index ``axis`` equals ``value`` and
    return them with that feature column removed.

    :param dataSet: the data to split (a list of lists)
    :param axis: index of the feature to split on
    :param value: the feature value a row must have to be kept
    :return: a new list of new rows; the input rows are not modified
    '''
    subset = []
    for row in dataSet:
        if row[axis] != value:
            continue
        # Rebuild the row from everything before and after the split column.
        trimmed = row[:axis]
        trimmed.extend(row[axis + 1:])
        subset.append(trimmed)
    return subset


def chooseBestFeatureToSplit(dataSet):
    """Pick the feature whose split gives the largest information gain.

    For every feature column the data set is partitioned by that column's
    distinct values; the weighted entropy of the partitions is compared with
    the entropy of the unsplit data.

    :param dataSet: list of rows; the last item of each row is the class label
    :return: index of the best feature, or -1 when no feature has a
             strictly positive information gain
    """
    featureCount = len(dataSet[0]) - 1
    entropyBefore = calcShannonEnt(dataSet)
    topGain = 0
    topIndex = -1
    for col in range(featureCount):
        distinct = set(row[col] for row in dataSet)
        # Weighted sum of the entropies of the partitions produced by this column.
        entropyAfter = 0
        for candidate in distinct:
            part = splitDataSet(dataSet, col, candidate)
            weight = len(part) / float(len(dataSet))
            entropyAfter += weight * calcShannonEnt(part)
        gain = entropyBefore - entropyAfter
        if gain > topGain:
            topGain, topIndex = gain, col
    return topIndex

def majorityCnt(classList):
    '''
    Return the class label that appears most often in ``classList``.

    When several labels share the highest count, the one that appeared
    first in ``classList`` is returned.
    '''
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # The sort is stable even with reverse=True, so ties keep first-seen order.
    ranked = sorted(tally, key=tally.get, reverse=True)
    return ranked[0]

def createTree(dataSet,labels):
    """Recursively build a decision tree as nested dictionaries.

    :param dataSet: non-empty list of rows; the last item of each row is the
                    class label, the preceding items are feature values
    :param labels: feature names, one per feature column of ``dataSet``.
                   This list is NOT modified; each recursion level works on
                   its own copy.
    :return: a class label for a leaf, otherwise
             ``{featureName: {featureValue: subtree_or_label, ...}}``
    :raises IndexError: if ``dataSet`` is empty
    """
    classList = [example[-1] for example in dataSet]  # class label of every row
    # All rows share one class: this branch is a leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # No feature columns left to split on: fall back to the majority class.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    bestFeat = chooseBestFeatureToSplit(dataSet)
    # -1 means no feature yields a positive information gain. Indexing with
    # -1 would split on the class column itself, so stop with a majority leaf.
    if bestFeat < 0:
        return majorityCnt(classList)
    bestFeatLabel = labels[bestFeat]
    # Drop the chosen feature's name from a copy so the caller's list stays intact.
    subLabels = list(labels)
    del subLabels[bestFeat]
    myTree = {bestFeatLabel: {}}
    uniqueVals = set(example[bestFeat] for example in dataSet)
    for value in uniqueVals:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
'''
Attenti