Python实现机器算法-03-贝叶斯

# -*- coding: UTF-8 -*-
"""

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@File         :   naivebayes.py    
@Contact      :   ffzzyy@126.com
@License      :   (C)Copyright 2017-2019
@Author       :   ffzzyy
@Version      :   0.1
@Modify Time  :   2019/3/15 15:04
@Description

"""

import numpy as np

import pandas  as  pd
from functools import reduce


def acount(x1, x2, axis=0):
    """
    Count, column by column, how often each value of ``x2`` occurs in ``x1``.

    Entry ``i`` of the result is the number of rows of ``x1`` whose ``i``-th
    column compares equal (``==``) to ``x2[i]``.

    :param x1: a numpy ndarray that supports ``x1[:, i]`` column slicing
    :param x2: a sequence with one target value per column to inspect
    :param axis: counting direction; accepted for the caller's convenience
                 but not read by the implementation (counting is always
                 done per column)
    :return: a float ndarray with ``len(x2)`` entries

    Examples
    --------
    >>> x1 = np.array([[1, "s"], [2, "l"], [1, "l"], [3, "m"]])
    >>> x2 = np.array([2, "l"])
    >>> acount(x1, x2).tolist()
    [1.0, 2.0]

    """
    per_column = [
        # A boolean comparison of the whole column; summing it counts matches.
        np.sum(np.array(x1[:, col] == target))
        for col, target in enumerate(x2)
    ]
    return np.array(per_column, dtype=float)


class NBClassifier:
    """Naive Bayes classifier for discrete (categorical) features.

    ``fit`` stores the training data and derives the sorted class list and
    the class priors.  ``predict`` returns, for every sample, a dict that
    maps each class label to ``P(class) * prod_i P(x_i | class)``.  The
    scores are not normalised and no arg-max is taken.

    Feature values are matched with ``==``, so a query value must compare
    equal to the training value it is meant to hit (``2`` does not match
    ``'2'``).
    """

    def __init__(self):
        self._x_train = None  # 2-D ndarray, one row per training sample
        self._y_train = None  # 1-D ndarray with one class label per row
        self._class = None  # sorted list of the distinct class labels
        self._prior_proba = {}  # class label -> prior probability

    def _set_class(self):
        """
        Store the sorted list of distinct labels found in the training labels.

        :return: None
        """
        self._class = sorted(np.unique(self._y_train))

    def _set_prior_proba(self):
        """
        Compute the prior of every class as (rows of that class) / (all rows).

        :return: None
        """
        labels = np.asarray(self._y_train)
        total = len(labels)
        for label in self._class:
            # int() keeps the stored prior a plain Python float.
            self._prior_proba[label] = int(np.sum(labels == label)) / total

    def _get_condi_proba(self, x, c):
        """
        Compute P(x | c) as the product of the per-feature conditionals.

        Each conditional is (rows of class ``c`` whose feature ``i`` equals
        ``x[i]``) / (rows of class ``c``).  A conditional that comes out as
        zero is replaced by ``1 / (N_c + S_i)``, where ``N_c`` is the number
        of rows of class ``c`` and ``S_i`` the number of distinct values of
        feature ``i`` in the whole training set.

        NOTE: only zero counts are smoothed; non-zero counts stay at the
        plain frequency, so the conditionals of one feature need not sum
        to 1.

        :param x: one sample, a sequence of feature values
        :param c: the class label to condition on
        :return: the product of the conditionals (1.0 for a sample with no
                 features)
        """
        # Row indices of class c; flatnonzero also copes with a scalar
        # comparison result by yielding an empty selection.
        rows = np.flatnonzero(np.asarray(self._y_train) == c)
        in_class = self._x_train[rows]
        class_size = len(rows)

        matches = np.array(
            [np.sum(np.array(in_class[:, i] == value)) for i, value in enumerate(x)],
            dtype=float,
        )
        condi_proba = matches / class_size

        # Replace zero probabilities so one unseen value cannot zero the product.
        for i in np.flatnonzero(condi_proba == 0):
            distinct_values = len(np.unique(self._x_train[:, i]))
            condi_proba[i] = 1 / (class_size + distinct_values)

        # Multiply left to right, starting from 1.0 so that an empty sample
        # yields 1.0 instead of raising.
        likelihood = 1.0
        for proba in condi_proba:
            likelihood = likelihood * proba
        return likelihood

    def fit(self, x_train, y_train):
        """
        Store the training data and compute the class list and the priors.

        :param x_train: 2-D array-like of feature values, one row per sample;
                        an ndarray is stored as is, anything else is
                        converted to an object ndarray so that rows mixing
                        numbers and strings keep their original types
        :param y_train: array-like with one class label per row of
                        ``x_train``; it is flattened to one dimension
        :return: None
        :raises ValueError: if ``x_train`` is not 2-D or the number of rows
                            differs from the number of labels
        """
        if isinstance(x_train, np.ndarray):
            features = x_train
        else:
            features = np.asarray(x_train, dtype=object)
        labels = np.asarray(y_train).ravel()

        if features.ndim != 2:
            raise ValueError(
                "x_train must be 2-D (samples x features), got %d dimension(s)"
                % features.ndim
            )
        if features.shape[0] != labels.shape[0]:
            raise ValueError(
                "x_train has %d rows but y_train has %d labels"
                % (features.shape[0], labels.shape[0])
            )

        self._x_train = features
        self._y_train = labels
        # Reset anything left over from a previous fit.
        self._class = None
        self._prior_proba = {}
        self._set_class()
        self._set_prior_proba()

    def _predict(self, x):
        """
        Score a single sample against every known class.

        :param x: one sample, a sequence of feature values
        :return: dict mapping class label -> P(x | class) * P(class)
        """
        return {
            label: self._get_condi_proba(x, label) * self._prior_proba[label]
            for label in self._class
        }

    def predict(self, X_predict):
        """
        Score every sample in ``X_predict``.

        :param X_predict: iterable of samples (ndarray-like)
        :return: ndarray holding one score dict per sample
        """
        return np.array([self._predict(x) for x in X_predict])



def load_data(file_path, encoding='cp936'):
    """
    Read a CSV training set and split it into features and labels.

    The file is read with ``pandas.read_csv`` and converted to an ndarray.
    The last column is returned as the labels, and every column before it
    as the features.

    :param file_path: path of the CSV file
    :param encoding: text encoding passed to ``pandas.read_csv``; defaults
                     to ``'cp936'``
    :return: tuple ``(x_train, y_train)`` of ndarrays
    """
    table = np.array(pd.read_csv(file_path, encoding=encoding))
    # The last column holds y_train; everything to its left is x_train.
    return table[:, :-1], table[:, -1]


def main():
    """Train on the two demo CSV files and print classes, priors and one prediction each."""

    def report(features, labels, sample):
        # Fit a fresh classifier, then print what it learned and its scores
        # for the single query sample.
        model = NBClassifier()
        model.fit(features, labels)
        print(model._class)
        print(model._prior_proba)
        print(model.predict([sample]))

    features, labels = load_data("贝叶斯测试.csv")
    report(features, labels, [2, 's'])

    features, labels = load_data("西瓜数据.csv")
    # The watermelon data needs its first column removed before training.
    features = np.delete(features, 0, axis=1)
    report(features, labels, ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑'])


if __name__ == '__main__':
    main()

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值