Datawhale代码地址
参考周志华《机器学习》
1. 朴素贝叶斯算法
朴素贝叶斯代码实现
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_blobs
from sklearn.naive_bayes import GaussianNB

sns.set()

# Demo: Gaussian naive Bayes on a synthetic two-cluster dataset.
# make_blobs generates isotropic Gaussian blobs:
#   n_samples: number of points, n_features: dimensionality,
#   centers: number of cluster centers (default 3),
#   cluster_std: per-cluster standard deviation (default 1.0),
#   random_state: RNG seed for reproducibility.
X, y = make_blobs(n_samples=100, n_features=2, centers=2, random_state=2, cluster_std=1.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='RdBu')
plt.show()

# Fit a Gaussian naive Bayes classifier on the labeled blobs.
model = GaussianNB()
model.fit(X, y)

# Predict on 2000 random points spread over the rectangle
# [-6, 8] x [-14, 4] (list + array broadcasts element-wise).
rng = np.random.RandomState(0)
X_test = [-6, -14] + [14, 18] * rng.rand(2000, 2)
y_pred = model.predict(X_test)

# Overlay the (faint) predictions on the training data, restoring the
# original axis limits afterwards so the training view is unchanged.
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='RdBu')
lim = plt.axis()
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred, s=20, cmap='RdBu', alpha=0.1)
plt.axis(lim)
plt.show()

# Posterior class probabilities for the last 8 test points.
# Fix: the bare expression `yprob[-8:].round(2)` only displays inside a
# notebook; print it explicitly so the script shows the result too.
yprob = model.predict_proba(X_test)
print(yprob[-8:].round(2))
2. EM(Expectation-Maximization)算法
1. numpy实现EM算法
本代码用于模拟k=2个正态分布的均值估计。其中ini_data(Sigma,Mu1,Mu2,k,N)函数用于生成训练样本,此训练样本是从两个高斯分布中随机生成的,其中高斯分布a均值Mu1=40、均方差Sigma=6,高斯分布b均值Mu2=20、均方差Sigma=6,生成的样本分布如下图所示。由于本问题中实际无法直接从样本数据中获知两个高斯分布参数,因此需要使用EM算法估算出具体Mu1、Mu2取值。
# -*- coding: utf-8 -*-
import numpy as np
import math
import copy
import matplotlib.pyplot as plt
isdebug = True

# Reference: Tom M. Mitchell, "Machine Learning", p.137
# Adapted from http://blog.csdn.net/chasdmeng/article/details/38709063


def init_data(Sigma, Mu1, Mu2, k, N):
    """Generate N observations from a 2-component Gaussian mixture.

    Each sample is drawn from N(Mu1, Sigma) or N(Mu2, Sigma) with equal
    probability (both components share the same standard deviation Sigma).
    Fills the module-level globals:
      X            -- (1, N) array of observations
      Mu           -- length-k array of randomly initialised mean estimates
      Expectations -- (N, k) responsibility matrix (initialised to zeros)
    """
    global X
    global Mu
    global Expectations
    X = np.zeros((1, N))
    Mu = np.random.random(k)
    Expectations = np.zeros((N, k))
    for idx in range(N):
        # A fair coin flip decides which component generates this sample.
        if np.random.random(1) > 0.5:
            X[0, idx] = np.random.normal(Mu1, Sigma)
        else:
            X[0, idx] = np.random.normal(Mu2, Sigma)
    if isdebug:
        print("***********")
        print("初始观测数据X:")
        print(X)
# EM step 1 (E-step): compute the responsibilities E[z_ij].
def e_step(Sigma, k, N):
    """Fill Expectations[i, j] with the posterior probability that sample i
    came from component j, given the current means Mu and the shared
    standard deviation Sigma.

    The Gaussian normalising constant is identical for every component, so
    it cancels in the ratio and only the exponential term is evaluated.
    """
    global Expectations
    global Mu
    global X
    # Hoist the loop-invariant 1/(2*sigma^2) factor.
    inv_two_var = 1.0 / (2.0 * float(Sigma ** 2))
    for i in range(N):
        weights = [math.exp(-(float(X[0, i] - Mu[j]) ** 2) * inv_two_var)
                   for j in range(k)]
        total = sum(weights)
        for j in range(k):
            Expectations[i, j] = weights[j] / total
    if isdebug:
        print("***********")
        print("隐藏变量E(Z):")
        print(Expectations)
# EM step 2 (M-step): re-estimate each mean Mu[j] as the
# responsibility-weighted average of the observations.
def m_step(k, N):
    global Expectations
    global X
    for j in range(k):
        weighted_sum = 0
        weight_total = 0
        for i in range(N):
            weighted_sum += Expectations[i, j] * X[0, i]
            weight_total += Expectations[i, j]
        # Element assignment mutates Mu in place, so no `global Mu` is needed.
        Mu[j] = weighted_sum / weight_total
# Run EM for at most iter_num iterations, stopping early once the total
# change in the means drops below Epsilon.
def run(Sigma, Mu1, Mu2, k, N, iter_num, Epsilon):
    init_data(Sigma, Mu1, Mu2, k, N)
    print("初始<u1,u2>:", Mu)
    for step in range(iter_num):
        previous = copy.deepcopy(Mu)
        e_step(Sigma, k, N)
        m_step(k, N)
        print(step, Mu)
        # L1 distance between consecutive mean estimates as the
        # convergence criterion.
        if sum(abs(Mu - previous)) < Epsilon:
            break
if __name__ == '__main__':
    # Ground-truth parameters used to synthesise the training data.
    sigma = 6         # shared standard deviation of both Gaussians
    mu1 = 40          # true mean of the first component
    mu2 = 20          # true mean of the second component
    k = 2             # number of mixture components
    N = 1000          # number of samples
    iter_num = 1000   # maximum number of EM iterations
    epsilon = 0.0001  # stop once the means change less than this
    run(sigma, mu1, mu2, k, N, iter_num, epsilon)
    # Histogram of the generated samples: two overlapping bell shapes.
    plt.hist(X[0, :], 50)
    plt.show()
2. sklearn实现EM算法
1. EM算法实现
import numpy as np
from scipy.stats import multivariate_normal
from sklearn.mixture import GaussianMixture
from mpl_toolkits.mplot3d import Axes3D
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import pairwise_distances_argmin
# Use a CJK-capable font and fix minus-sign rendering for Chinese labels.
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False

if __name__ == '__main__':
    # Pick the solver: sklearn's GaussianMixture or the hand-written EM loop.
    # style = 'sklearn'
    style = 'myself'
    np.random.seed(0)

    # Component 1: axis-aligned Gaussian, 400 samples.
    # np.random.multivariate_normal(mean, cov, size): cov must be symmetric
    # positive semi-definite.
    mu1_fact = (0, 0, 0)
    cov1_fact = np.diag((1, 2, 3))
    data1 = np.random.multivariate_normal(mu1_fact, cov1_fact, 400)
    print('data1 shape: {0}'.format(data1.shape))

    # Component 2: correlated Gaussian, 100 samples.
    mu2_fact = (2, 2, 1)
    cov2_fact = np.array(((4, 1, 3), (1, 2, 1), (3, 1, 4)))
    data2 = np.random.multivariate_normal(mu2_fact, cov2_fact, 100)
    print('data2 shape: {0}'.format(data2.shape))

    data = np.vstack((data1, data2))
    print('data shape: {0}'.format(data.shape))
    # True membership: first 400 samples belong to component 1.
    y = np.array([True] * 400 + [False] * 100)

    if style == 'sklearn':
        g = GaussianMixture(n_components=2, covariance_type='full', tol=1e-6, max_iter=1000)
        g.fit(data)
        print('类别概率:\t', g.weights_[0])
        print('均值:\n', g.means_, '\n')
        print('方差:\n', g.covariances_, '\n')
        mu1, mu2 = g.means_
        sigma1, sigma2 = g.covariances_
    else:
        num_iter = 100
        n, d = data.shape
        # Initialise the two means at the data extremes and the covariances
        # as identity matrices; mixing weight starts at 0.5.
        mu1 = data.min(axis=0)
        mu2 = data.max(axis=0)
        sigma1 = np.identity(d)
        sigma2 = np.identity(d)
        pi = 0.5
        for it in range(num_iter):
            # E-step: responsibilities gamma from the current Gaussians.
            norm1 = multivariate_normal(mu1, sigma1)
            norm2 = multivariate_normal(mu2, sigma2)
            tau1 = pi * norm1.pdf(data)
            tau2 = (1 - pi) * norm2.pdf(data)
            gamma = tau1 / (tau1 + tau2)
            # M-step: responsibility-weighted means, covariances, weight.
            mu1 = np.dot(gamma, data) / np.sum(gamma)
            mu2 = np.dot((1 - gamma), data) / np.sum((1 - gamma))
            sigma1 = np.dot(gamma * (data - mu1).T, data - mu1) / np.sum(gamma)
            sigma2 = np.dot((1 - gamma) * (data - mu2).T, data - mu2) / np.sum(1 - gamma)
            pi = np.sum(gamma) / n
            print(it, ":\t", mu1, mu2)
        print('类别概率:\t', pi)
        print('均值:\t', mu1, mu2)
        print('方差:\n', sigma1, '\n\n', sigma2, '\n')

    # Classification: assign each point to the component with the larger
    # probability density under the fitted Gaussians.
    norm1 = multivariate_normal(mu1, sigma1)
    norm2 = multivariate_normal(mu2, sigma2)
    tau1 = norm1.pdf(data)
    tau2 = norm2.pdf(data)

    fig = plt.figure(figsize=(10, 5), facecolor='w')
    ax = fig.add_subplot(121, projection='3d')
    ax.scatter(data[:, 0], data[:, 1], data[:, 2], c='b', s=30, marker='o', edgecolors='k', depthshade=True)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    ax.set_title('原始数据', fontsize=15)

    ax = fig.add_subplot(122, projection='3d')
    # Match each estimated mean to the nearest true mean so cluster labels
    # line up with the ground truth before computing accuracy.
    order = pairwise_distances_argmin([mu1_fact, mu2_fact], [mu1, mu2], metric='euclidean')
    # order = pairwise_distances_argmin([mu1_fact, mu2_fact], [mu1, mu2], metric='cosine')
    print(order)
    if order[0] == 0:
        c1 = tau1 > tau2
    else:
        c1 = tau1 < tau2
    c2 = ~c1
    # Accuracy = fraction of predictions equal to the true labels.
    acc = np.mean(y == c1)
    print('准确率:%.2f%%' % (100 * acc))

    ax.scatter(data[c1, 0], data[c1, 1], data[c1, 2], c='r', s=30, marker='o', edgecolors='k', depthshade=True)
    ax.scatter(data[c2, 0], data[c2, 1], data[c2, 2], c='g', s=30, marker='^', edgecolors='k', depthshade=True)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_zlabel('Z')
    ax.set_title('EM算法分类', fontsize=15)
    plt.suptitle('EM算法的实现', fontsize=18)
    plt.subplots_adjust(top=0.90)
    # plt.tight_layout()
    plt.show()
2. GMM与DPGMM比较
# !/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
import scipy as sp
import matplotlib as mpl
import matplotlib.colors
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
def expand(a, b, rate=0.05):
    """Widen the interval [a, b] by `rate` of its length on each side."""
    margin = (b - a) * rate
    return a - margin, b + margin
# Use a CJK-capable font and fix minus-sign rendering (consistent with the
# `mpl` alias used elsewhere in this script).
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False

if __name__ == '__main__':
    # Synthesise two clusters: the first is linearly transformed so its
    # covariance becomes correlated, the second stays axis-aligned.
    np.random.seed(0)
    cov1 = np.diag((1, 2))
    N1 = 500
    N2 = 300
    N = N1 + N2
    x1 = np.random.multivariate_normal(mean=(3, 2), cov=cov1, size=N1)
    m = np.array(((1, 1), (1, 3)))
    x1 = x1.dot(m)
    x2 = np.random.multivariate_normal(mean=(-1, 10), cov=cov1, size=N2)
    x = np.vstack((x1, x2))
    y = np.array([0] * N1 + [1] * N2)
    # Deliberately over-specify the component count: the DP prior should
    # switch superfluous components off while the plain GMM uses them all.
    n_components = 3

    # Plotting setup: colormap and a slightly padded evaluation grid.
    colors = '#A0FFA0', '#2090E0', '#FF8080'
    cm = mpl.colors.ListedColormap(colors)
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)
    x1, x2 = np.mgrid[x1_min:x1_max:500j, x2_min:x2_max:500j]
    grid_test = np.stack((x1.flat, x2.flat), axis=1)

    plt.figure(figsize=(6, 6), facecolor='w')
    plt.suptitle('GMM/DPGMM比较', fontsize=15)

    # ---------- GMM ----------
    ax = plt.subplot(211)
    gmm = GaussianMixture(n_components=n_components, covariance_type='full', random_state=0)
    gmm.fit(x)
    centers = gmm.means_
    covs = gmm.covariances_
    print('GMM均值 = \n', centers)
    print('GMM方差 = \n', covs)
    y_hat = gmm.predict(x)
    grid_hat = gmm.predict(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
    plt.scatter(x[:, 0], x[:, 1], s=20, c=y, cmap=cm, marker='o', edgecolors='#202020')
    clrs = list('rgbmy')
    for i, (center, cov) in enumerate(zip(centers, covs)):
        # Fix: use np.linalg instead of sp.linalg — `import scipy as sp`
        # does not import the scipy.linalg submodule, so sp.linalg may not
        # exist; numpy's eigh/norm are equivalent for a symmetric 2x2 cov.
        value, vector = np.linalg.eigh(cov)
        width, height = value[0], value[1]
        v = vector[0] / np.linalg.norm(vector[0])
        angle = 180 * np.arctan(v[1] / v[0]) / np.pi
        e = Ellipse(xy=center, width=width, height=height,
                    angle=angle, color=clrs[i], alpha=0.5, clip_box=ax.bbox)
        ax.add_artist(e)
    ax1_min, ax1_max, ax2_min, ax2_max = plt.axis()
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.title('GMM', fontsize=15)
    # Fix: the `b` keyword of grid() was removed in Matplotlib 3.5;
    # passing the flag positionally works on every version.
    plt.grid(True, ls=':', color='#606060')

    # ---------- DPGMM ----------
    dpgmm = BayesianGaussianMixture(n_components=n_components, covariance_type='full', max_iter=1000, n_init=5,
                                    weight_concentration_prior_type='dirichlet_process', weight_concentration_prior=0.1)
    dpgmm.fit(x)
    centers = dpgmm.means_
    covs = dpgmm.covariances_
    print('DPGMM均值 = \n', centers)
    print('DPGMM方差 = \n', covs)
    y_hat = dpgmm.predict(x)
    print(y_hat)
    ax = plt.subplot(212)
    grid_hat = dpgmm.predict(grid_test)
    grid_hat = grid_hat.reshape(x1.shape)
    plt.pcolormesh(x1, x2, grid_hat, cmap=cm)
    plt.scatter(x[:, 0], x[:, 1], s=20, c=y, cmap=cm, marker='o', edgecolors='#202020')
    for i, cc in enumerate(zip(centers, covs)):
        if i not in y_hat:
            # Skip components the Dirichlet-process prior assigned no data.
            continue
        center, cov = cc
        value, vector = np.linalg.eigh(cov)
        width, height = value[0], value[1]
        v = vector[0] / np.linalg.norm(vector[0])
        angle = 180 * np.arctan(v[1] / v[0]) / np.pi
        e = Ellipse(xy=center, width=width, height=height,
                    angle=angle, color='m', alpha=0.5, clip_box=ax.bbox)
        ax.add_artist(e)
    plt.xlim((x1_min, x1_max))
    plt.ylim((x2_min, x2_max))
    plt.title('DPGMM', fontsize=15)
    plt.grid(True, ls=':', color='#606060')
    # Fix: `pad` is keyword-only in Matplotlib >= 3.6; passing 2
    # positionally raises TypeError there.
    plt.tight_layout(pad=2, rect=(0, 0, 1, 0.95))
    plt.show()