Introduction to the Naive Bayes Classifier

Algorithm Derivation

By Bayes' theorem, for a sample with features $X=(X_1,X_2,\dots,X_p)$ the posterior probability of class $C_k$ is

$P(Y=C_k\mid X)=\frac{P(Y=C_k)P(X\mid Y=C_k)}{P(X)}\propto P(Y=C_k)P(X\mid Y=C_k)$

The "naive" part is the assumption that the features are conditionally independent given the class:

$P(X\mid Y=C_k)=\prod_{i=1}^{p}P(X_i\mid Y=C_k)$

Since $P(X)$ is the same for every class, prediction reduces to picking the $C_k$ that maximizes $P(Y=C_k)\prod_{i=1}^{p}P(X_i\mid Y=C_k)$. All that remains is estimating the prior and the per-feature likelihoods from the training data, which the flow below spells out.

Algorithm Flow

  1. If the prior of Y is not given, estimate it first (with Laplace smoothing): $P(Y=C_k)=\frac{m_k+\lambda}{m+K\lambda}$, where $m_k$ is the number of training samples in class $C_k$, $m$ is the total number of samples, and $K$ is the number of classes.
  2. Estimate the conditional probability of the $j$-th value of the $i$-th feature given the $k$-th class, $P(X_i=x_{ij}\mid Y=C_k)$ (a minimal sketch of these smoothed estimates follows this list):
    • For a discrete feature: $P(X_i=x_{ij}\mid Y=C_k)=\frac{m_{kij}+\lambda}{m_k+S_i\lambda}$, where $S_i$ is the number of distinct values of feature $i$; $\lambda$ can be 1 or any other positive number.
    • For a sparse binary feature: $P(X_i=x_{ij}\mid Y=C_k)=P(i\mid Y=C_k)x_{ij}+(1-P(i\mid Y=C_k))(1-x_{ij})$; here $x_{ij}$ takes only two values.
    • For a continuous feature, no per-value probabilities are needed; fit a normal distribution instead: $P(X_i=x_i\mid Y=C_k)=\frac{1}{\sqrt{2\pi\sigma_k^2}}\exp\left(-\frac{(x_i-\mu_k)^2}{2\sigma_k^2}\right)$, where $\mu_k$ is the mean of $X_i$ over the samples of class $C_k$ and $\sigma_k^2$ is the variance of $X_i$ over the samples of class $C_k$.
  3. For a test instance $X^{(test)}$, compute for each class: $P(Y=C_k)\prod_{i=1}^{p}P(X_i=x_i^{(test)}\mid Y=C_k)$
  4. Determine the class $C_{result}$ of $X^{(test)}$: $C_{result}=\arg\max_{C_k}P(Y=C_k)\prod_{i=1}^{p}P(X_i=X_i^{(test)}\mid Y=C_k)$
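
To make the smoothed estimates of steps 1 and 2 concrete, here is a minimal NumPy sketch. The toy data and the helpers `laplace_prior` and `laplace_likelihood` are illustrative only, not part of the classifier implemented below:

import numpy as np

def laplace_prior(y, k, lam=1.0):
    # P(Y=C_k) = (m_k + lambda) / (m + K*lambda)
    m, K = len(y), y.max() + 1
    return (np.sum(y == k) + lam) / (m + K * lam)

def laplace_likelihood(x_i, y, v, k, lam=1.0):
    # P(X_i=v | Y=C_k) = (m_kij + lambda) / (m_k + S_i*lambda)
    mask = y == k
    S_i = len(np.unique(x_i))  # number of distinct values of feature i
    return (np.sum(x_i[mask] == v) + lam) / (mask.sum() + S_i * lam)

y = np.array([0, 0, 1, 1, 1])
x_i = np.array([2, 2, 2, 3, 3])          # one discrete feature
print(laplace_prior(y, 0))               # (2+1)/(5+2) ≈ 0.429
print(laplace_likelihood(x_i, y, 2, 1))  # (1+1)/(3+2) = 0.4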

Handwritten Code Implementation

import os
os.chdir('../')  # move to the repo root so that ml_models is importable
from ml_models import utils
from ml_models.wrapper_models import DataBinWrapper
import numpy as np
from sklearn.datasets import make_blobs  # sklearn.datasets.samples_generator was removed in newer versions
import matplotlib.pyplot as plt

%matplotlib inline
X, y = make_blobs(n_samples=400, centers=4, cluster_std=0.85, random_state=0)

#swap the x and y coordinates of each sample
X=X[:,::-1]

X_0 = X[y == 0]
X_1 = X[y == 1]
X_2 = X[y == 2]
X_3 = X[y == 3]
fig=plt.figure(figsize=(10,6))
plt.plot(X_0[:,0],X_0[:,1],'o')
plt.plot(X_1[:,0],X_1[:,1],'o')
plt.plot(X_2[:,0],X_2[:,1],'o')
plt.plot(X_3[:,0],X_3[:,1],'o')
plt.grid(True)

(figure: scatter plot of the four generated clusters)
Naive Bayes Classifier

#naive Bayes classifier
class NaiveBayesClassifier(object):
    def __init__(self,max_bins=10):
        self.discrete=DataBinWrapper(max_bins=max_bins)#discretize the features
        #model parameters
        self.default_y_prob=None #default log-probability for an unseen class
        self.default_x_prob={}#default log-probability for an unseen feature value
        self.p_y={}#log p(y)
        self.p_x_y={}#log p(x|y)
        self.class_num=None#number of classes (the k in C_k)
        
    def fit(self,x,y):
        self.default_y_prob=np.log(0.5 / (y.max()+1))
        #discretize x
        self.discrete.fit(x)
        x_bins=self.discrete.transform(x)#x_bins is the discretized x
        self.class_num=y.max()+1#number of classes
        #parameter estimation of p(y) and p(x|y), as given by Bayes' theorem
        for y_index in range(0,self.class_num):
            #p(y)
            y_n_sample=np.sum(y==y_index)#number of samples in class y_index
            self.default_x_prob[y_index]=np.log(0.5/y_n_sample)
            x_y=x_bins[y==y_index]# <==> [x_bins[i] for i in range(len(y)) if y[i]==y_index]
            #p(y)=(m_k+1)/(m+K): Laplace smoothing with lambda=1, matching step 1 above
            self.p_y[y_index]=np.log((y_n_sample+1)/(len(y)+self.class_num))
            self.p_x_y[y_index]={}
            #p(x|y)
            for i in range(0,x_y.shape[1]):#iterate over the feature dimensions of x
                self.p_x_y[y_index][i]={}
                x_i_feature_set=set(x_y[:,i])#the set of observed values of feature i
                for x_feature in x_i_feature_set:
                    #(m_kij+1)/(m_k+S_i): Laplace smoothing with lambda=1, matching step 2 above
                    self.p_x_y[y_index][i][x_feature]=np.log(
                        (np.sum(x_y[:,i]==x_feature)+1)/(y_n_sample+len(x_i_feature_set)))
                    
    def predict_proba(self, x):
        x_bins = self.discrete.transform(x)
        rst = []
        for x_row in x_bins:
            tmp = []
            for y_index in range(0, self.class_num):
                try:
                    p_y_log = self.p_y[y_index]
                except KeyError:
                    p_y_log = self.default_y_prob
                for i,xij in enumerate(x_row):#enumerate yields (0,x_row[0]),(1,x_row[1]),...
                    try:
                        p_y_log += self.p_x_y[y_index][i][xij]
                    except KeyError:
                        #fall back for feature values never seen with this class
                        p_y_log += self.default_x_prob[y_index]
                tmp.append(p_y_log)
            rst.append(tmp)
        return utils.softmax(np.asarray(rst))

    def predict(self, x):
        return np.argmax(self.predict_proba(x), axis=1)

Testing

bayes_model=NaiveBayesClassifier(max_bins=20)
bayes_model.fit(X,y)
utils.plot_decision_function(X,y,bayes_model)
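
As a quick sanity check (a minimal sketch; this measures accuracy on the training data itself, so it is an optimistic estimate):

#training accuracy of the binned naive Bayes model
train_acc = (bayes_model.predict(X) == y).mean()
print('training accuracy: %.3f' % train_acc)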

(figure: decision boundary of the binned naive Bayes classifier)
Because the features were discretized into bins, the decision boundary is coarse.
Instead, we can treat each feature dimension as a sample drawn from a one-dimensional Gaussian; this gives the Gaussian naive Bayes classifier.

#Because the features were binned, the decision boundary is coarse;
#instead, treat each feature dimension as drawn from a 1-D Gaussian: the Gaussian naive Bayes classifier
class GaussianNBClassifier(object):
    def __init__(self):
        self.p_y = {}  # log p(y)
        self.p_x_y = {}  # Gaussian parameters of p(x | y)
        self.class_num = None

    def fit(self, x, y):
        # parameter estimation
        self.class_num = y.max() + 1
        for y_index in range(0, self.class_num):  # iterate over the classes
            # p(y)
            y_n_sample = np.sum(y == y_index)  # number of samples in class C_k
            self.p_y[y_index] = np.log(y_n_sample / len(y))
            self.p_x_y[y_index] = {}
            # p(x | y)
            x_y = x[y == y_index]  # the samples belonging to class y_index
            for i in range(0, x_y.shape[1]):  # iterate over the feature dimensions
                u = np.mean(x_y[:, i])  # mean of feature i within class y_index
                sigma = np.std(x_y[:, i])  # standard deviation (not variance)
                self.p_x_y[y_index][i] = [u, sigma]

    def predict_proba(self, x):
        rst = []
        for x_row in x:
            tmp = []
            for y_index in range(0, self.class_num):
                p_y_log = self.p_y[y_index]
                for j in range(0, len(x_row)):
                    xij = x_row[j]
                    p_y_log += np.log(utils.gaussian_1d(xij, self.p_x_y[y_index][j][0], self.p_x_y[y_index][j][1]))
                tmp.append(p_y_log)
            rst.append(tmp)
        return utils.softmax(np.asarray(rst))

    def predict(self, x):
        return np.argmax(self.predict_proba(x), axis=1)

Testing

#test
gaussian_model = GaussianNBClassifier()
gaussian_model.fit(X, y)
utils.plot_decision_function(X, y, gaussian_model)
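
For reference, scikit-learn ships its own Gaussian naive Bayes; a minimal comparison sketch (assuming the same `X`, `y`, and the fitted `gaussian_model` above):

from sklearn.naive_bayes import GaussianNB

sk_model = GaussianNB().fit(X, y)
#fraction of points on which the handwritten model and sklearn agree
agreement = (sk_model.predict(X) == gaussian_model.predict(X)).mean()
print('agreement with sklearn GaussianNB: %.3f' % agreement)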

(figure: decision boundary of the Gaussian naive Bayes classifier)
Self-written helper libraries used by the handwritten code

DataBinWrapper

import numpy as np


class DataBinWrapper(object):
    def __init__(self, max_bins=10):
        # number of bins
        self.max_bins = max_bins
        # per-feature bin boundaries (percentile cut points) of x
        self.XrangeMap = None

    def fit(self, x):
        n_sample, n_feature = x.shape
        # build the bin boundaries
        self.XrangeMap = [[] for _ in range(0, n_feature)]
        for index in range(0, n_feature):
            tmp = sorted(x[:, index])
            for percent in range(1, self.max_bins):
                percent_value = np.percentile(tmp, (1.0 * percent / self.max_bins) * 100.0 // 1)
                self.XrangeMap[index].append(percent_value)
            self.XrangeMap[index] = sorted(list(set(self.XrangeMap[index])))

    def transform(self, x):
        """
        map x to bin indices (x_bin_index)
        :param x:
        :return:
        """
        if x.ndim == 1:
            return np.asarray([np.digitize(x[i], self.XrangeMap[i]) for i in range(0, x.size)])
        else:
            return np.asarray([np.digitize(x[:, i], self.XrangeMap[i]) for i in range(0, x.shape[1])]).T
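
A minimal usage sketch of `DataBinWrapper` (random data, for illustration only):

np.random.seed(0)
x = np.random.randn(100, 2)   # 100 samples, 2 continuous features
binner = DataBinWrapper(max_bins=4)
binner.fit(x)                 # learn per-feature percentile cut points
x_bins = binner.transform(x)  # map each value to its bin index
print(np.unique(x_bins))      # bin indices fall in [0, max_bins - 1]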

utils

import numpy as np
import copy
from scipy import special
import math
import matplotlib.pyplot as plt


def sign(x):
    """
    sign function
    :param x:
    :return:
    """
    if x > 0:
        return 1
    elif x < 0:
        return -1
    else:
        return 0


def sigmoid(x2):
    """
    sigmoid function
    :param x2:
    :return:
    """
    x = copy.deepcopy(x2)
    if np.isscalar(x):  # handle plain Python scalars (int or float)
        x = 20.0 if x > 20.0 else x
        x = -100.0 if x < -100.0 else x
    else:
        # avoid underflow
        x[x > 20.0] = 20.0
        # avoid overflow
        x[x < -100.0] = -100.0
    return 1 / (1 + np.exp(-x))


def softmax(x):
    """
    softmax function
    :param x:
    :return:
    """
    if x.ndim == 1:
        return np.exp(x) / np.exp(x).sum()
    else:
        return np.exp(x) / np.exp(x).sum(axis=1, keepdims=True)


def entropy(x, sample_weight=None):
    """
    compute entropy
    :param x:
    :param sample_weight:
    :return:
    """
    x = np.asarray(x)
    # number of elements in x
    x_num = len(x)
    # if sample_weight is None, give every sample the same weight
    if sample_weight is None:
        sample_weight = np.asarray([1.0] * x_num)
    x_counter = {}
    weight_counter = {}
    # count occurrences of each value of x and collect the corresponding sample_weight lists
    for index in range(0, x_num):
        x_value = x[index]
        if x_counter.get(x_value) is None:
            x_counter[x_value] = 0
            weight_counter[x_value] = []
        x_counter[x_value] += 1
        weight_counter[x_value].append(sample_weight[index])

    # compute the entropy
    ent = .0
    for key, value in x_counter.items():
        p_i = 1.0 * value * np.mean(weight_counter.get(key)) / x_num
        ent += -p_i * math.log(p_i)
    return ent


def cond_entropy(x, y, sample_weight=None):
    """
    compute the conditional entropy H(y|x)
    """
    x = np.asarray(x)
    y = np.asarray(y)
    # number of elements in x
    x_num = len(x)
    # if sample_weight is None, give every sample the same weight
    if sample_weight is None:
        sample_weight = np.asarray([1.0] * x_num)
    # compute
    ent = .0
    for x_value in set(x):
        x_index = np.where(x == x_value)
        new_x = x[x_index]
        new_y = y[x_index]
        new_sample_weight = sample_weight[x_index]
        p_i = 1.0 * len(new_x) / x_num
        ent += p_i * entropy(new_y, new_sample_weight)
    return ent


def muti_info(x, y, sample_weight=None):
    """
    mutual information / information gain: H(y)-H(y|x)
    """
    x_num = len(x)
    if sample_weight is None:
        sample_weight = np.asarray([1.0] * x_num)
    return entropy(y, sample_weight) - cond_entropy(x, y, sample_weight)


def info_gain_rate(x, y, sample_weight=None):
    """
    information gain ratio
    """
    x_num = len(x)
    if sample_weight is None:
        sample_weight = np.asarray([1.0] * x_num)
    return 1.0 * muti_info(x, y, sample_weight) / (1e-12 + entropy(x, sample_weight))
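
# Toy check of the helpers above (illustrative; entropy uses natural logs,
# so a balanced binary variable has entropy ln 2 ≈ 0.693):
#   >>> entropy([0, 0, 1, 1])                     # ln 2 ≈ 0.693
#   >>> cond_entropy([0, 0, 1, 1], [0, 0, 1, 1])  # 0.0: y is constant within each x group
#   >>> muti_info([0, 0, 1, 1], [0, 0, 1, 1])     # ≈ 0.693: the full entropy of y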


def gini(x, sample_weight=None):
    """
    compute the Gini index Gini(D)
    :param x:
    :param sample_weight:
    :return:
    """
    x_num = len(x)
    # if sample_weight is None, give every sample the same weight
    if sample_weight is None:
        sample_weight = np.asarray([1.0] * x_num)
    x_counter = {}
    weight_counter = {}
    # count occurrences of each value of x and collect the corresponding sample_weight lists
    for index in range(0, x_num):
        x_value = x[index]
        if x_counter.get(x_value) is None:
            x_counter[x_value] = 0
            weight_counter[x_value] = []
        x_counter[x_value] += 1
        weight_counter[x_value].append(sample_weight[index])

    # compute the Gini index
    gini_value = 1.0
    for key, value in x_counter.items():
        p_i = 1.0 * value * np.mean(weight_counter.get(key)) / x_num
        gini_value -= p_i * p_i
    return gini_value


def cond_gini(x, y, sample_weight=None):
    """
    compute the conditional Gini index Gini(y,x)
    """
    x = np.asarray(x)
    y = np.asarray(y)
    # number of elements in x
    x_num = len(x)
    # if sample_weight is None, give every sample the same weight
    if sample_weight is None:
        sample_weight = np.asarray([1.0] * x_num)
    # compute
    gini_value = .0
    for x_value in set(x):
        x_index = np.where(x == x_value)
        new_x = x[x_index]
        new_y = y[x_index]
        new_sample_weight = sample_weight[x_index]
        p_i = 1.0 * len(new_x) / x_num
        gini_value += p_i * gini(new_y, new_sample_weight)
    return gini_value


def gini_gain(x, y, sample_weight=None):
    """
    Gini gain
    """
    x_num = len(x)
    if sample_weight is None:
        sample_weight = np.asarray([1.0] * x_num)
    return gini(y, sample_weight) - cond_gini(x, y, sample_weight)


def square_error(x, sample_weight=None):
    """
    squared error
    :param x:
    :param sample_weight:
    :return:
    """
    x = np.asarray(x)
    # x_mean = np.mean(x)
    x_num = len(x)
    if sample_weight is None:
        sample_weight = np.asarray([1.0] * x_num)
    x_mean = np.dot(x, sample_weight / np.sum(sample_weight))
    error = 0.0
    for index in range(0, x_num):
        error += (x[index] - x_mean) * (x[index] - x_mean) * sample_weight[index]
    return error


def cond_square_error(x, y, sample_weight=None):
    """
    squared error of y grouped by x
    :param x:
    :param y:
    :param sample_weight:
    :return:
    """
    x = np.asarray(x)
    y = np.asarray(y)
    # number of elements in x
    x_num = len(x)
    # if sample_weight is None, give every sample the same weight
    if sample_weight is None:
        sample_weight = np.asarray([1.0] * x_num)
    # compute
    error = .0
    for x_value in set(x):
        x_index = np.where(x == x_value)
        new_y = y[x_index]
        new_sample_weight = sample_weight[x_index]
        error += square_error(new_y, new_sample_weight)
    return error


def square_error_gain(x, y, sample_weight=None):
    """
    gain in squared error
    :param x:
    :param y:
    :param sample_weight:
    :return:
    """
    x_num = len(x)
    if sample_weight is None:
        sample_weight = np.asarray([1.0] * x_num)
    return square_error(y, sample_weight) - cond_square_error(x, y, sample_weight)


def gaussian_1d(x, u, sigma):
    """
    1-D Gaussian probability density function
    :param x:
    :param u:
    :param sigma:
    :return:
    """
    return 1 / (np.sqrt(2 * np.pi) * sigma + 1e-12) * np.exp(-1 * np.power(x - u, 2) / (2 * sigma ** 2 + 1e-12))
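
# Illustrative sanity check: gaussian_1d should match scipy.stats.norm.pdf
# up to the 1e-12 stabilizer terms, e.g.
#   >>> from scipy import stats
#   >>> gaussian_1d(0.5, 0.0, 1.0)   # ≈ 0.35207
#   >>> stats.norm.pdf(0.5)          # ≈ 0.35207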


def gaussian_nd(x, u, sigma):
    """
    multivariate Gaussian density
    :param x:
    :param u:
    :param sigma:
    :return:
    """
    if x.ndim == 1:
        x = np.expand_dims(x, axis=0)
    return 1.0 / (np.power(2 * np.pi, x.shape[1] / 2) * np.sqrt(np.linalg.det(sigma))) * np.exp(
        np.sum(-0.5 * (x - u).dot(np.linalg.inv(sigma)) * (x - u), axis=1))


def dirichlet(u, alpha):
    """
    Dirichlet distribution
    :param u: random variable
    :param alpha: hyperparameters
    :return:
    """
    # compute the normalization factor
    beta = special.gamma(np.sum(alpha))
    for alp in alpha:
        beta /= special.gamma(alp)
    rst = beta
    # compute the result
    for idx in range(0, len(alpha)):
        rst *= np.power(u[idx], alpha[idx] - 1)
    return rst


def wishart(Lambda, W, v):
    """
    Wishart distribution
    :param Lambda: random variable
    :param W: hyperparameter
    :param v: hyperparameter
    :return:
    """
    # dimension
    D = W.shape[0]
    # compute the normalization factor first
    B = np.power(np.linalg.det(W), -1 * v / 2)
    B_ = np.power(2.0, v * D / 2) * np.power(np.pi, D * (D - 1) / 4)
    for i in range(1, D + 1):
        B_ *= special.gamma((v + 1 - i) / 2)
    B = B / B_
    # compute the remaining part
    rst = B * np.power(np.linalg.det(Lambda), (v - D - 1) / 2)
    rst *= np.exp(-0.5 * np.trace(np.linalg.inv(W) @ Lambda))
    return rst


def St(X, mu, Lambda, v):
    """
    Student's t distribution
    :param X: random variable
    :param mu: hyperparameter
    :param Lambda: hyperparameter
    :param v: hyperparameter
    :return:
    """
    n_sample, D = X.shape
    return special.gamma(D / 2 + v / 2) * np.power(np.linalg.det(Lambda), 0.5) * np.power(
        1 + np.sum((X - mu) @ Lambda * (X - mu), axis=1) / v, -1.0 * D / 2 - v / 2) / special.gamma(v / 2) / np.power(
        np.pi * v, D / 2)


"""
plot the decision boundary
"""


def plot_decision_function(X, y, clf, support_vectors=None):
    plot_step = 0.02
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.contourf(xx, yy, Z, alpha=0.4)
    plt.scatter(X[:, 0], X[:, 1], alpha=0.8, c=y, edgecolor='k')
    # plot the support vectors
    if support_vectors is not None:
        plt.scatter(X[support_vectors, 0], X[support_vectors, 1], s=80, c='none', alpha=0.7, edgecolor='red')


"""
plot contour lines
"""


def plot_contourf(data, func, lines=3):
    n = 256
    x = np.linspace(data[:, 0].min(), data[:, 0].max(), n)
    y = np.linspace(data[:, 1].min(), data[:, 1].max(), n)
    X, Y = np.meshgrid(x, y)
    C = plt.contour(X, Y, func(np.c_[X.reshape(-1), Y.reshape(-1)]).reshape(X.shape), lines, colors='g', linewidths=0.5)
    plt.clabel(C, inline=True, fontsize=10)
    plt.scatter(data[:, 0], data[:, 1])
