机器学习-决策树(一)(李航统计学习方法)

前言

本文主要对书中第五章的例5.3进行代码实现,分别给出ID3算法和C4.5算法的实现(文末附完整代码)

一、导入需要的库

from math import log

二、读取数据

def loadDataSet():
    """Return (samples, featureNames) for the loan-application data of
    example 5.3 in Li Hang's *Statistical Learning Methods* (Table 5.1).

    Each sample row is [age, has_job, owns_house, credit_rating, class].
    """
    samples = [['青年', '否', '否', '一般', '否'],
               ['青年', '否', '否', '好', '否'],
               ['青年', '是', '否', '好', '是'],
               ['青年', '是', '是', '一般', '是'],
               ['青年', '否', '否', '一般', '否'],
               ['中年', '否', '否', '一般', '否'],
               ['中年', '否', '否', '好', '否'],
               ['中年', '是', '是', '好', '是'],
               ['中年', '否', '是', '非常好', '是'],
               ['中年', '否', '是', '非常好', '是'],
               ['老年', '否', '是', '非常好', '是'],
               ['老年', '否', '是', '好', '是'],
               ['老年', '是', '否', '好', '是'],
               ['老年', '是', '否', '非常好', '是'],
               ['老年', '否', '否', '一般', '否']]
    featureNames = ['年龄', '有工作', '有自己的房子', '信贷情况']
    return samples, featureNames

三、创建树

def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dicts.

    dataSet: rows of feature values with the class label as the last item.
    labels : feature names aligned with the remaining feature columns;
             the caller's list is NOT mutated (the original `del` shortened it).
    Returns a class label (leaf) or {featureName: {featureValue: subtree}}.
    """
    classList = [example[-1] for example in dataSet]
    if len(set(classList)) == 1:
        # All samples belong to one class: single-node tree (leaf).
        return classList[0]
    if len(dataSet[0]) == 1:
        # BUG FIX: rows always keep the class label, so when the feature set
        # is exhausted each row has length 1 — the original test `== 0`
        # could never fire. Use the majority class of D as the node label.
        return classmax(classList)
    bestFeat = chooseBestFeatureSplit(dataSet)
    bestFeatLabel = labels[bestFeat]

    # Build the subtree rooted at the best feature.
    myTree = {bestFeatLabel: {}}
    # Copy instead of `del labels[bestFeat]` so the caller's list survives.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]

    featValues = [example[bestFeat] for example in dataSet]
    for value in set(featValues):
        subDataSet = splitData(dataSet, bestFeat, value)
        myTree[bestFeatLabel][value] = createTree(subDataSet, subLabels)
    return myTree

四、将数据集D中实例数最大的类作为该节点的类标记

def classmax(classList):
    """Majority vote: return the class label occurring most often in classList.

    On ties the label seen earliest wins, matching the original dict-based
    version (dicts preserve insertion order).
    """
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    best, bestCount = None, -1
    for label, count in tally.items():
        if count > bestCount:
            best, bestCount = label, count
    return best

五、计算数据集D的经验熵

def entropy(dataSet):
    """Empirical entropy H(D) of the class labels (last column of each row)."""
    total = len(dataSet)
    counts = {}
    for row in dataSet:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    H = 0.0
    for ck in counts.values():
        p = ck / total  # |Ck| / |D|
        H -= p * log(p, 2)
    return H

六、返回不同value的数据

def splitData(dataSet, axis, value):
    """Return the rows whose feature at column `axis` equals `value`,
    with that column removed from each returned row.

    Leftover debug `print` calls were removed: they flooded stdout on
    every candidate split while building the tree.
    """
    return [data[:axis] + data[axis + 1:] for data in dataSet if data[axis] == value]

七、选择信息增益最大的特征为最优特征

ID3算法 (用信息增益选择特征)

def chooseBestFeatureSplit(dataSet):
    """Pick the feature index with the largest information gain g(D, A) (ID3).

    Returns -1 when no feature produces a positive gain.
    """
    baseEntropy = entropy(dataSet)  # H(D)
    total = len(dataSet)
    bestGain, bestFeature = 0.0, -1
    for i in range(len(dataSet[0]) - 1):
        condEntropy = 0.0  # empirical conditional entropy H(D|A)
        for value in {row[i] for row in dataSet}:
            subset = splitData(dataSet, i, value)
            weight = len(subset) / total  # |Di| / |D|
            condEntropy += weight * entropy(subset)
        gain = baseEntropy - condEntropy
        if gain > bestGain:
            bestGain, bestFeature = gain, i
    return bestFeature

C4.5算法(用信息增益比选择特征)

def chooseBestFeatureSplit(dataSet):
    """Select the feature with the largest information gain ratio (C4.5).

    dataSet: rows of feature values with the class label as the last item.
    Returns the column index of the best feature, or -1 if no feature
    yields a positive gain ratio.
    """
    HD = entropy(dataSet)  # empirical entropy H(D)
    bestGain = 0.0
    bestFeature = -1
    for i in range(len(dataSet[0]) - 1):
        featValues = {data[i] for data in dataSet}
        condEntropy = 0.0  # empirical conditional entropy H(D|A)
        HAD = 0.0          # split information H_A(D)
        for value in featValues:
            subData = splitData(dataSet, i, value)
            pi = len(subData) / len(dataSet)  # |Di| / |D|
            condEntropy += pi * entropy(subData)
            HAD -= pi * log(pi, 2)
        if HAD == 0.0:
            # BUG FIX: a feature with a single value has zero split
            # information; the original divided by HAD and raised
            # ZeroDivisionError. Such a feature cannot split D — skip it.
            continue
        gainRatio = (HD - condEntropy) / HAD
        if gainRatio > bestGain:
            bestGain = gainRatio
            bestFeature = i
    return bestFeature

八、执行

# Driver: load Table 5.1, build the decision tree, and print the nested dict.
dataSet, label = loadDataSet()
print(createTree(dataSet, label))

完整代码

ID3算法

from math import log
def loadDataSet():
    """Return (samples, featureNames) for the loan-application data of
    example 5.3 in Li Hang's *Statistical Learning Methods* (Table 5.1).

    Each sample row is [age, has_job, owns_house, credit_rating, class].
    """
    samples = [['青年', '否', '否', '一般', '否'],
               ['青年', '否', '否', '好', '否'],
               ['青年', '是', '否', '好', '是'],
               ['青年', '是', '是', '一般', '是'],
               ['青年', '否', '否', '一般', '否'],
               ['中年', '否', '否', '一般', '否'],
               ['中年', '否', '否', '好', '否'],
               ['中年', '是', '是', '好', '是'],
               ['中年', '否', '是', '非常好', '是'],
               ['中年', '否', '是', '非常好', '是'],
               ['老年', '否', '是', '非常好', '是'],
               ['老年', '否', '是', '好', '是'],
               ['老年', '是', '否', '好', '是'],
               ['老年', '是', '否', '非常好', '是'],
               ['老年', '否', '否', '一般', '否']]
    featureNames = ['年龄', '有工作', '有自己的房子', '信贷情况']
    return samples, featureNames


def classmax(classList):
    """Majority vote: return the class label occurring most often in classList.

    On ties the label seen earliest wins, matching the original dict-based
    version (dicts preserve insertion order).
    """
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    best, bestCount = None, -1
    for label, count in tally.items():
        if count > bestCount:
            best, bestCount = label, count
    return best


def entropy(dataSet):
    """Empirical entropy H(D) of the class labels (last column of each row)."""
    total = len(dataSet)
    counts = {}
    for row in dataSet:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    H = 0.0
    for ck in counts.values():
        p = ck / total  # |Ck| / |D|
        H -= p * log(p, 2)
    return H


def splitData(dataSet, axis, value):
    """Return the rows whose feature at column `axis` equals `value`,
    with that column removed from each returned row.

    Leftover debug `print` calls were removed: they flooded stdout on
    every candidate split while building the tree.
    """
    return [data[:axis] + data[axis + 1:] for data in dataSet if data[axis] == value]


def chooseBestFeatureSplit(dataSet):
    """Pick the feature index with the largest information gain g(D, A) (ID3).

    Returns -1 when no feature produces a positive gain.
    """
    baseEntropy = entropy(dataSet)  # H(D)
    total = len(dataSet)
    bestGain, bestFeature = 0.0, -1
    for i in range(len(dataSet[0]) - 1):
        condEntropy = 0.0  # empirical conditional entropy H(D|A)
        for value in {row[i] for row in dataSet}:
            subset = splitData(dataSet, i, value)
            weight = len(subset) / total  # |Di| / |D|
            condEntropy += weight * entropy(subset)
        gain = baseEntropy - condEntropy
        if gain > bestGain:
            bestGain, bestFeature = gain, i
    return bestFeature


def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dicts.

    dataSet: rows of feature values with the class label as the last item.
    labels : feature names aligned with the remaining feature columns;
             the caller's list is NOT mutated (the original `del` shortened it).
    Returns a class label (leaf) or {featureName: {featureValue: subtree}}.
    """
    classList = [example[-1] for example in dataSet]
    if len(set(classList)) == 1:
        # All samples belong to one class: single-node tree (leaf).
        return classList[0]
    if len(dataSet[0]) == 1:
        # BUG FIX: rows always keep the class label, so when the feature set
        # is exhausted each row has length 1 — the original test `== 0`
        # could never fire. Use the majority class of D as the node label.
        return classmax(classList)
    bestFeat = chooseBestFeatureSplit(dataSet)
    bestFeatLabel = labels[bestFeat]

    # Build the subtree rooted at the best feature.
    myTree = {bestFeatLabel: {}}
    # Copy instead of `del labels[bestFeat]` so the caller's list survives.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]

    featValues = [example[bestFeat] for example in dataSet]
    for value in set(featValues):
        subDataSet = splitData(dataSet, bestFeat, value)
        myTree[bestFeatLabel][value] = createTree(subDataSet, subLabels)
    return myTree


# Driver: build and print the decision tree for Table 5.1.
dataSet, label = loadDataSet()
print(createTree(dataSet, label))
# Result: {'有自己的房子': {'是': '是', '否': {'有工作': {'是': '是', '否': '否'}}}}

C4.5算法

from math import log
def loadDataSet():
    """Return (samples, featureNames) for the loan-application data of
    example 5.3 in Li Hang's *Statistical Learning Methods* (Table 5.1).

    Each sample row is [age, has_job, owns_house, credit_rating, class].
    """
    samples = [['青年', '否', '否', '一般', '否'],
               ['青年', '否', '否', '好', '否'],
               ['青年', '是', '否', '好', '是'],
               ['青年', '是', '是', '一般', '是'],
               ['青年', '否', '否', '一般', '否'],
               ['中年', '否', '否', '一般', '否'],
               ['中年', '否', '否', '好', '否'],
               ['中年', '是', '是', '好', '是'],
               ['中年', '否', '是', '非常好', '是'],
               ['中年', '否', '是', '非常好', '是'],
               ['老年', '否', '是', '非常好', '是'],
               ['老年', '否', '是', '好', '是'],
               ['老年', '是', '否', '好', '是'],
               ['老年', '是', '否', '非常好', '是'],
               ['老年', '否', '否', '一般', '否']]
    featureNames = ['年龄', '有工作', '有自己的房子', '信贷情况']
    return samples, featureNames


def classmax(classList):
    """Majority vote: return the class label occurring most often in classList.

    On ties the label seen earliest wins, matching the original dict-based
    version (dicts preserve insertion order).
    """
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    best, bestCount = None, -1
    for label, count in tally.items():
        if count > bestCount:
            best, bestCount = label, count
    return best


def entropy(dataSet):
    """Empirical entropy H(D) of the class labels (last column of each row)."""
    total = len(dataSet)
    counts = {}
    for row in dataSet:
        counts[row[-1]] = counts.get(row[-1], 0) + 1
    H = 0.0
    for ck in counts.values():
        p = ck / total  # |Ck| / |D|
        H -= p * log(p, 2)
    return H


def splitData(dataSet, axis, value):
    """Collect the rows whose column `axis` equals `value`, dropping that
    column from every row that is kept."""
    kept = []
    for row in dataSet:
        if row[axis] != value:
            continue
        trimmed = row[:axis]
        trimmed.extend(row[axis + 1:])
        kept.append(trimmed)
    return kept


def chooseBestFeatureSplit(dataSet):
    """Select the feature with the largest information gain ratio (C4.5).

    dataSet: rows of feature values with the class label as the last item.
    Returns the column index of the best feature, or -1 if no feature
    yields a positive gain ratio.
    """
    HD = entropy(dataSet)  # empirical entropy H(D)
    bestGain = 0.0
    bestFeature = -1
    for i in range(len(dataSet[0]) - 1):
        featValues = {data[i] for data in dataSet}
        condEntropy = 0.0  # empirical conditional entropy H(D|A)
        HAD = 0.0          # split information H_A(D)
        for value in featValues:
            subData = splitData(dataSet, i, value)
            pi = len(subData) / len(dataSet)  # |Di| / |D|
            condEntropy += pi * entropy(subData)
            HAD -= pi * log(pi, 2)
        if HAD == 0.0:
            # BUG FIX: a feature with a single value has zero split
            # information; the original divided by HAD and raised
            # ZeroDivisionError. Such a feature cannot split D — skip it.
            continue
        gainRatio = (HD - condEntropy) / HAD
        if gainRatio > bestGain:
            bestGain = gainRatio
            bestFeature = i
    return bestFeature


def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dicts.

    dataSet: rows of feature values with the class label as the last item.
    labels : feature names aligned with the remaining feature columns;
             the caller's list is NOT mutated (the original `del` shortened it).
    Returns a class label (leaf) or {featureName: {featureValue: subtree}}.
    """
    classList = [example[-1] for example in dataSet]
    if len(set(classList)) == 1:
        # All samples belong to one class: single-node tree (leaf).
        return classList[0]
    if len(dataSet[0]) == 1:
        # BUG FIX: rows always keep the class label, so when the feature set
        # is exhausted each row has length 1 — the original test `== 0`
        # could never fire. Use the majority class of D as the node label.
        return classmax(classList)
    bestFeat = chooseBestFeatureSplit(dataSet)
    bestFeatLabel = labels[bestFeat]

    # Build the subtree rooted at the best feature.
    myTree = {bestFeatLabel: {}}
    # Copy instead of `del labels[bestFeat]` so the caller's list survives.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]

    featValues = [example[bestFeat] for example in dataSet]
    for value in set(featValues):
        subDataSet = splitData(dataSet, bestFeat, value)
        myTree[bestFeatLabel][value] = createTree(subDataSet, subLabels)
    return myTree


# Driver: build and print the decision tree for Table 5.1 using C4.5.
dataSet, label = loadDataSet()
print(createTree(dataSet, label))
# Result: {'有自己的房子': {'是': '是', '否': {'有工作': {'是': '是', '否': '否'}}}}
  • 3
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

.别拖至春天.

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值