【Python】实现决策树算法和贝叶斯算法(附源代码)

1、贝叶斯算法

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import defaultdict
import collections
import math
import pickle
import operator

# Possible values for each attribute, in data-column order:
# outlook, temperature, humidity, wind, and the class label (play tennis?).
# Play collects the value sets so their sizes can be used for Laplace smoothing.
outlook = ["晴朗", "多云","雨天"]
Temperature = ["高温", "中温","低温"]
Humidity = ["高湿","一般"]
Wind = ["大", "小"]
PlayTennis=["是","否"]
Play = []
Play.append(outlook)
Play.append(Temperature)
Play.append(Humidity)
Play.append(Wind)
Play.append(PlayTennis)

# Dataset: each row is [outlook, temperature, humidity, wind, play-label]
data = [  ["晴朗","高温","高湿","小","否"],
          ["晴朗","高温","高湿","大","否"],
          ["多云","高温","高湿","小","是"],
          ["雨天","中温","高湿","小","是"],
          ["雨天","低温","一般","小","是"],
          ["雨天","低温","一般","大","否"],
          ["多云","低温","一般","大","是"],
          ["晴朗","中温","高湿","小","否"],
          ["晴朗","低温","一般","小","是"],
          ["雨天","中温","一般","小","是"],
          ["晴朗","中温","一般","大","是"],
          ["多云","中温","高湿","大","是"],
          ["晴朗","高温","一般","小","是"],
          ["多云", "高温", "一般", "小", "是"],
          ["雨天","中温","高湿","大","否"],
          ["晴朗","中温","高湿","大","否"]
          ]

length = len(data)
# Split the dataset: rows 1-12 become the training set, rows 13-16 the test set
train = data[:12]
train_length = len(train)
print("训练数据集")
for i in range(train_length):
    print(train[i])
test= data[12:]
test_length = len(test)
print("测试数据集")
for i in range(test_length):
    print(test[i])
def count_PlayTennis_total(data):
    """Count how many rows of *data* carry each class label (column 4).

    Returns a ``defaultdict(int)`` mapping label ("是"/"否") to frequency.

    Bug fix: the original iterated ``range(train_length)`` — a module-level
    global — instead of the size of the ``data`` argument, so it crashed or
    miscounted for any dataset that was not exactly the training set.
    """
    count = defaultdict(int)
    for row in data:  # iterate the argument itself, not a global length
        count[row[4]] += 1
    return count

# Prior probability P(y), with Laplace (add-one) smoothing
def cal_base_rates(data):
    """Return the Laplace-smoothed prior P(label) for each class in *data*.

    Bug fixes vs. the original: the denominator measured the global
    ``train`` list instead of the ``data`` argument, and the smoothing term
    was hard-coded to 2; ``len(y)`` (the number of observed classes) is the
    general form and gives the identical result for this two-class problem.
    """
    y = count_PlayTennis_total(data)
    base_rates = {}
    n = len(data)
    k = len(y)  # number of distinct class labels
    for label, freq in y.items():
        base_rates[label] = (freq + 1) / (n + k)
    return base_rates

# Show the smoothed class priors learned from the training set
print(cal_base_rates(train))

def count_sj(attr, Play):
    """Return the size of the attribute-value group in *Play* that contains
    *attr* (used as the Laplace smoothing denominator); None if absent."""
    for group in Play:
        if attr in group:
            return len(group)
    return None

# Likelihood P(x|y): the class-conditional probability of each attribute value
def likelihold_prob(data):
    """Compute Laplace-smoothed likelihoods P(attribute value | label).

    Returns ``{label: {attribute_value: probability}}`` where the smoothing
    denominator adds the size of the attribute's value set (from the global
    ``Play``).

    Bug fix: the counting loop ran over the global ``train_length`` instead
    of ``len(data)``, so the function silently ignored (or overran) rows
    whenever *data* was not the training set.
    """
    y = count_PlayTennis_total(data)
    likelihold = {}
    for label, c in y.items():
        # Temporary dict: raw counts of each attribute value under this label.
        attr_prob = defaultdict(int)
        for j in range(len(data)):  # fixed: was the global train_length
            if data[j][4] == label:
                for attr in range(4):
                    attr_prob[data[j][attr]] += 1
        for key, value in attr_prob.items():
            sj = count_sj(key, Play)  # number of possible values of this attribute
            attr_prob[key] = (value + 1) / (c + sj)
        likelihold[label] = attr_prob
    return likelihold

# Precompute the likelihood table once (Test() recomputes it internally as well)
LikeHold = likelihold_prob(train)

def Test(data, test):
    """Classify one sample with naive Bayes trained on *data*.

    *test* is a row of four attribute values.  Prints the sample and the
    per-label posterior scores, then returns the highest-scoring label.
    """
    class_counts = count_PlayTennis_total(data)
    likelihoods = likelihold_prob(data)
    priors = cal_base_rates(data)
    RATE = defaultdict(int)
    print(test)
    for label in class_counts:
        # Posterior score: product of the four likelihoods times the prior.
        score = 1
        for attr in test[:4]:
            score *= likelihoods[label][attr]
        score *= priors[label]
        RATE[label] = score
    print("预测结果: " )
    print(RATE)
    return sorted(RATE, key=lambda k: RATE[k])[-1]


# Prior probabilities
cal_base_rates(train)
# Conditional (likelihood) probabilities
likelihold_prob(train)
# Predict each of the four held-out test rows (first 4 columns = attributes)
Test(train,test[0][:4])
Test(train,test[1][:4])
Test(train,test[2][:4])
Test(train,test[3][:4])

(图:贝叶斯算法运行结果截图,原文图片缺失)

2、决策树算法

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import defaultdict
import collections
import math
import pickle
import operator

# Define the attribute value sets, in data-column order:
# outlook, temperature, humidity, wind, and the class label (play tennis?)
outlook = ["晴朗", "多云", "雨天"]
Temperature = ["高温", "中温", "低温"]
Humidity = ["高湿", "一般"]
Wind = ["大", "小"]
PlayTennis = ["是", "否"]
Play = []
Play.append(outlook)
Play.append(Temperature)
Play.append(Humidity)
Play.append(Wind)
Play.append(PlayTennis)

# Dataset: each row is [outlook, temperature, humidity, wind, play-label]
data = [["晴朗", "高温", "高湿", "小", "否"],
        ["晴朗", "高温", "高湿", "大", "否"],
        ["多云", "高温", "高湿", "小", "是"],
        ["雨天", "中温", "高湿", "小", "是"],
        ["雨天", "低温", "一般", "小", "是"],
        ["雨天", "低温", "一般", "大", "否"],
        ["多云", "低温", "一般", "大", "是"],
        ["晴朗", "中温", "高湿", "小", "否"],
        ["晴朗", "低温", "一般", "小", "是"],
        ["雨天", "中温", "一般", "小", "是"],
        ["晴朗", "中温", "一般", "大", "是"],
        ["多云", "中温", "高湿", "大", "是"],
        ["晴朗", "高温", "一般", "小", "是"],
        ["多云", "高温", "一般", "小", "是"],
        ["雨天", "中温", "高湿", "大", "否"],
        ["晴朗", "中温", "高湿", "大", "否"]
        ]

length = len(data)
# Split the dataset: rows 1-12 become the training set, rows 13-16 the test set
train = data[:12]
train_length = len(train)
print("训练数据集")
for i in range(train_length):
    print(train[i])
test = data[12:]
test_length = len(test)
print("测试数据集")
for i in range(test_length):
    print(test[i])


# Shannon entropy (base 2) of the class-label column (last item of each row)
def cal_entropy(dataset):
    """Return the information entropy of *dataset*'s labels."""
    total = len(dataset)
    freq = collections.Counter(row[-1] for row in dataset)
    entropy = 0
    for n in freq.values():
        p = n / total
        entropy -= p * math.log(p, 2)
    return entropy


# Select the rows whose column *axis* equals *value*, dropping that column
def splitDataSet(dataSet, axis, value):
    """Return the child dataset for the branch dataSet[i][axis] == value."""
    return [row[:axis] + row[axis + 1:] for row in dataSet if row[axis] == value]


# Pick the feature whose split yields the largest information gain
def chooseFeature(dataset):
    """Return the column index minimizing the weighted child entropy,
    or -1 if no split improves on the parent entropy."""
    best_entropy = cal_entropy(dataset)
    best_feature = -1
    total = len(dataset)
    for idx in range(len(dataset[0]) - 1):
        weighted = 0
        for val in set(row[idx] for row in dataset):
            child = splitDataSet(dataset, idx, val)
            weighted += len(child) / total * cal_entropy(child)
        if weighted < best_entropy:
            best_feature = idx
            best_entropy = weighted
    return best_feature


# When all features are exhausted, vote: return the most frequent class label
def most(classList):
    """Return the majority label in *classList*.

    Bug fix: the original incremented ``classCount[i]`` for each loop
    *index* rather than for ``classList[i]``, so every key had count 1 and
    the function always returned the integer 0 instead of a label.
    """
    classCount = {}
    for label in classList:
        classCount[label] = classCount.get(label, 0) + 1
    sortCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortCount[0][0]


# Build the ID3 decision tree as nested dicts: {feature_name: {value: subtree}}
def createDT(dataSet, labels):
    """Recursively construct a decision tree from *dataSet*.

    *labels* holds the feature names aligned with the data columns and is
    copied so the caller's list is never mutated.  A leaf is a class-label
    string; an internal node is a one-key dict keyed by the feature name.
    """
    tempLabels = labels[:]
    classList = [row[-1] for row in dataSet]
    # All rows share one label: pure leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # No features left (only the label column remains): majority vote.
    # Bug fix: vote over the labels, not over the raw data rows — the
    # original passed dataSet, which made most() return a meaningless 0.
    if len(dataSet[0]) == 1:
        return most(classList)
    character = chooseFeature(dataSet)
    node = tempLabels[character]
    myTree = {node: {}}
    del tempLabels[character]
    for value in set(row[character] for row in dataSet):
        myTree[node][value] = createDT(splitDataSet(dataSet, character, value), tempLabels)
    return myTree


# Walk the nested-dict tree until a leaf (class-label string) is reached
def classify(dTree, labels, testData):
    """Return the predicted label for *testData*, or None when the tree has
    no branch matching one of its attribute values."""
    node = list(dTree.keys())[0]
    branches = dTree[node]
    index = labels.index(node)
    result = None
    print(testData)
    for value, subtree in branches.items():
        if testData[index] != value:
            continue
        if isinstance(subtree, dict):
            # Internal node: recurse down the matching branch.
            result = classify(subtree, labels, testData)
        else:
            print("预测结果: ")
            result = subtree
    return result


# Persist a built tree with pickle so it can be reloaded later
def stroeTree(myTree, filename):
    """Serialize *myTree* to *filename* as a binary pickle.

    Improvement: a ``with`` block guarantees the file handle is closed even
    if pickling raises.  (The typo in the function name is kept so existing
    callers keep working.)
    """
    with open(filename, 'wb') as f:
        pickle.dump(myTree, f)


# Load a previously pickled decision tree
def loadTree(filename):
    """Deserialize and return the tree stored at *filename*.

    Bug fix: the original never closed the file handle; ``with`` releases it
    deterministically.  NOTE(review): ``pickle.load`` is unsafe on untrusted
    files — only load files this program wrote itself.
    """
    with open(filename, 'rb') as f:
        return pickle.load(f)


# Feature names aligned with the data columns; the last entry is the target
labels = ['天气状况', '温度', '湿度', '风力', '是否适合游玩(预测变量)']
myTree = createDT(train, labels)
# Round-trip the tree through pickle to demonstrate save/load
stroeTree(myTree, '1')
myTree = loadTree('1')
print(myTree)

(图:决策树算法运行结果截图,原文图片缺失)

3、两种算法比较

决策树算法和贝叶斯算法都是常见的机器学习算法,用于分类和预测任务,但它们的工作原理和应用场景有所不同。以下是它们之间的比较:

决策树算法贝叶斯算法
工作原理基于对输入数据集进行递归分割,每次分割都基于某个特征的某个阈值,以最小化节点的不纯度(如基尼系数、信息增益等)。这种递归分割形成了一个树形结构,每个叶子节点代表一个类别或一个预测结果。基于贝叶斯定理,利用已知的数据集和特征之间的关系来计算给定一个新样本属于每个类别的概率。它假设特征之间相互独立,然后利用贝叶斯公式计算后验概率,从而进行分类。
应用场景适用于具有离散和连续特征的分类问题,特别是当特征之间存在复杂关系时。它易于理解和解释,并且对异常值和缺失值有较好的鲁棒性。适用于处理高维数据和文本分类等问题。它在样本量较少时表现良好,并且对噪声数据有一定的鲁棒性
优点易于理解、处理离散和连续数据、对异常值和缺失值具有鲁棒性对小样本数据有效、对噪声数据有一定的鲁棒性、能够处理高维数据
缺点容易过拟合、对数据分布敏感以及不稳定性对特征之间相互独立的假设较强、在特征相关性较高时表现不佳
模型解释易于解释和可视化,可以通过树结构直观地理解每个决策的依据通常较为复杂,不太容易直接解释,因为它涉及到对多个特征之间的概率关系进行建模
  • 3
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值