朴素贝叶斯

最新推荐文章于 2020-10-23 22:58:18 发布

chenlang_lbs

最新推荐文章于 2020-10-23 22:58:18 发布

阅读量175

点赞数

分类专栏：机器学习

本文链接：https://blog.csdn.net/chenlang_lbs/article/details/78982813

版权

机器学习专栏收录该内容

3 篇文章 0 订阅

订阅专栏

#!/usr/bin/env python3
# -*- coding:utf-8 -*-
__author__ = 'lbs'

import numpy as np
from collections import Counter

'''
说明: 本实现仅适用于离散型特征, 条件概率按频数估计。
对于连续型特征, 需要改用概率密度(例如假设服从高斯分布)来估计条件概率;
若同时包含连续型和离散型特征, 则需要对两类特征分别计算, 再将结果相乘。
'''


class NaiveBayes(object):
    """Naive Bayes classifier for samples with exactly two discrete features.

    Each training sample has the shape ``([x1, x2], label)``.  Probabilities
    are estimated from raw frequencies, and the machine epsilon is added to
    every estimate so that a zero count never turns the whole product in
    :meth:`predict` into exactly zero.
    """

    def __init__(self, train_set):
        """Store the training data and collect feature values and labels.

        Args:
            train_set: Re-iterable sequence of ``([x1, x2], label)`` samples.
                Feature values and labels must be hashable.
        """
        # Spacing between 1.0 and the next float; used as a tiny offset.
        self.eps = np.spacing(1)
        self.train_set = train_set
        self.x1_set = {sample[0][0] for sample in self.train_set}
        self.x2_set = {sample[0][1] for sample in self.train_set}
        # View of (label, count) pairs in order of first appearance.
        self.y_labels = Counter(sample[1] for sample in self.train_set).items()

    def priori_probability(self):
        """Return ``{str(label): P(label) + eps}`` for every training label."""
        train_set_len = len(self.train_set)
        return {
            str(label): (count / train_set_len + self.eps)
            for label, count in self.y_labels
        }

    def conditional_probability(self):
        """Return the per-label frequency estimate of every feature value.

        Returns:
            dict: Keys have the form ``"x1_<value>_<label>"`` and
            ``"x2_<value>_<label>"``; each value is
            ``count(value, label) / count(label) + eps``.  Every
            combination of a known feature value and a known label is
            present, including those with a zero count.
        """
        # Count each (value, label) pair once instead of rescanning the
        # training set for every combination.
        x1_counts = Counter((s[0][0], s[1]) for s in self.train_set)
        x2_counts = Counter((s[0][1], s[1]) for s in self.train_set)

        y_conditional_probs = {}
        for label, label_count in self.y_labels:
            for x1 in self.x1_set:
                key = "x1_" + str(x1) + "_" + str(label)
                y_conditional_probs[key] = (
                    x1_counts[(x1, label)] / label_count + self.eps)
            for x2 in self.x2_set:
                key = "x2_" + str(x2) + "_" + str(label)
                y_conditional_probs[key] = (
                    x2_counts[(x2, label)] / label_count + self.eps)
        return y_conditional_probs

    def predict(self, input_data):
        """Return the most probable label for one sample.

        Args:
            input_data: Sequence ``[x1, x2]`` holding the two feature values
                of the sample to classify, e.g. ``[2, "S"]``.

        Returns:
            str: ``str(label)`` of the label with the largest unnormalised
            posterior.  On ties the label seen first in the training set
            wins.

        Raises:
            ValueError: If the training set is empty.
        """
        priori_prob = self.priori_probability()
        if not priori_prob:
            raise ValueError("cannot predict: the training set is empty")
        conditional_prob = self.conditional_probability()

        posterior_probs = {}
        for label, prior in priori_prob.items():
            likelihoods = []
            for prefix, value in (("x1", input_data[0]), ("x2", input_data[1])):
                # Look the key up directly instead of splitting stored keys
                # on "_": splitting breaks as soon as a feature value or a
                # label itself contains an underscore.
                likelihood = conditional_prob.get(
                    prefix + "_" + str(value) + "_" + label)
                # A value never seen in training has no entry; it then
                # contributes no factor for any label.
                if likelihood is not None:
                    likelihoods.append(likelihood)
            posterior_probs[label] = prior * np.prod(likelihoods)

        return max(posterior_probs.items(), key=lambda item: item[1])[0]


if __name__ == '__main__':
    # Toy data set of 15 samples, written column by column: the first
    # feature, the second feature and the class label of each sample.
    x1_column = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]
    x2_column = list("SMMSS" "SMMLL" "LMMLL")
    y_column = [-1, -1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, -1]
    samples = [([x1, x2], y)
               for x1, x2, y in zip(x1_column, x2_column, y_column)]

    classifier = NaiveBayes(samples)

    # Classify a single query sample and report the predicted label.
    query = [2, "S"]
    predicted_label = classifier.predict(query)
    print("result: ", predicted_label)