Python Implementation of the GMM EM Algorithm

Reference article: https://blog.csdn.net/qq_30091945/article/details/81134598

I came across the reference article while working on a course assignment from 深蓝学院 (Shenlan College). Its code follows the formulas closely and is easy to understand, but it relies heavily on for loops, which badly hurts clustering speed, so I rewrote it in vectorized form. The triple-quoted (commented-out) blocks below preserve the original for-loop versions.
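For reference, the code implements the standard textbook EM updates for a Gaussian mixture (these are the usual equations, nothing specific to this post). The E-step computes the responsibilities

$$\gamma_{nk}=\frac{\pi_k\,\mathcal{N}(x_n\mid\mu_k,\Sigma_k)}{\sum_{j=1}^{K}\pi_j\,\mathcal{N}(x_n\mid\mu_j,\Sigma_j)}$$

and the M-step, with $N_k=\sum_{n}\gamma_{nk}$, re-estimates

$$\pi_k=\frac{N_k}{N},\qquad \mu_k=\frac{1}{N_k}\sum_{n=1}^{N}\gamma_{nk}\,x_n,\qquad \Sigma_k=\frac{1}{N_k}\sum_{n=1}^{N}\gamma_{nk}\,(x_n-\mu_k)(x_n-\mu_k)^{\top}$$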

# File purpose: implement the GMM algorithm

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
plt.style.use('seaborn')  # on matplotlib >= 3.6 this style is named 'seaborn-v0_8'

class GMM(object):
    def __init__(self, n_clusters, max_iter=50):
        self.n_clusters = n_clusters
        self.max_iter = max_iter

    def GMM_EM(self):
        '''
        Optimize the GMM parameters with the EM algorithm.
        Updates self.weights, self.means and self.covars in place.
        '''
        loglikelihood = 0
        oldloglikelihood = 1
        num, dim = np.shape(self.data)
        # gammas[num, n_clusters]: posterior probability that sample n belongs to
        # the k-th Gaussian component, analogous to rnk in K-Means (whether the
        # n-th point belongs to the k-th cluster)
        gammas = np.zeros((num, self.n_clusters))
        # Iterate until the log-likelihood converges or max_iter is reached
        i = 0
        while np.abs(loglikelihood - oldloglikelihood) > 1e-4 and i < self.max_iter:
            oldloglikelihood = loglikelihood
            # E-STEP: compute the posteriors gammas,
            # gammas[n] = P(z|x) = P(x|z)P(z) / sum_k P(x|z_k)P(z_k),
            # i.e. for point n, the probability that it belongs to each
            # Gaussian component: gammas[n] (1 x n_clusters)
            
            '''
            # Original per-sample loop. p_x = P(x|z)P(z) for each component and
            # p_x_sum = sum_k P(x|z_k)P(z_k); self.weights is the prior and
            # Gaussian is the density function.
            for n in range(num):
                p_x = [self.weights[k] * self.Gaussian(self.data[n], self.means[k], self.covars[k])
                       for k in range(self.n_clusters)]
                p_x = np.array(p_x)
                p_x_sum = np.sum(p_x)
                gammas[n] = p_x / p_x_sum
            '''

            for k in range(self.n_clusters):
                prob = multivariate_normal.pdf(self.data, self.means[k], self.covars[k])
                gammas[:, k] = prob * self.weights[k]
            # Normalize so each row of responsibilities sums to 1
            gamma_sum = np.sum(gammas, axis=1)
            for k in range(self.n_clusters):
                gammas[:, k] = gammas[:, k] / gamma_sum
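            # A numerically safer E-step (a sketch of mine, not from the original
            # article): if a point is far from every component, the unnormalized
            # gammas can all underflow to 0 and the division above yields NaN.
            # Working in log space with scipy's logpdf and logsumexp avoids this:
            '''
            from scipy.special import logsumexp
            log_g = np.array([np.log(self.weights[k]) +
                              multivariate_normal.logpdf(self.data, self.means[k], self.covars[k])
                              for k in range(self.n_clusters)]).T  # num x n_clusters
            gammas = np.exp(log_g - logsumexp(log_g, axis=1, keepdims=True))
            '''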
            # M-STEP: re-estimate the parameters of each component
            for k in range(self.n_clusters):
                # Nk: the effective number of samples assigned to component k
                Nk = np.sum(gammas[:, k])
                # Update the weight pi of each Gaussian
                self.weights[k] = 1.0 * Nk / num
                # Update the mean Mu of each Gaussian
                '''self.means[k] = (1.0/Nk) * np.sum([gammas[n][k] * self.data[n] for n in range(num)], axis=0)'''
                self.means[k] = gammas[:, k].dot(self.data) / Nk
                # Update the covariance matrix Var of each Gaussian
                xdiffs = self.data - self.means[k]
                '''self.covars[k] = (1.0/Nk) * np.sum([gammas[n][k] * xdiffs[n].reshape((dim,1)).dot(xdiffs[n].reshape((1,dim))) for n in range(num)], axis=0)'''
                # w_xdiffs: responsibilities times centered data (row-wise scaling)
                w_xdiffs = gammas[:, k][:, np.newaxis] * xdiffs
                self.covars[k] = (1.0 / Nk) * np.dot(w_xdiffs.T, xdiffs)
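                # Equivalent one-liner (a sketch using np.einsum; produces the
                # same result as the two lines above, shown only as an alternative):
                '''
                self.covars[k] = np.einsum('n,ni,nj->ij', gammas[:, k], xdiffs, xdiffs) / Nk
                '''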
            # Recompute the objective used for the convergence test
            loglikelihood = 0
            '''
            # Original double loop:
            for n in range(num):
                for k in range(self.n_clusters):
                    loglikelihood += gammas[n][k] * (np.log(self.weights[k]) + np.log(self.Gaussian(self.data[n], self.means[k], self.covars[k])))
            '''
            for k in range(self.n_clusters):
                loglikelihood += np.sum(gammas[:, k] * np.log(self.weights[k]) + gammas[:, k] * np.log(multivariate_normal.pdf(self.data, self.means[k], self.covars[k])))
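            # Aside (my note, not in the original article): the sum above is the
            # expected complete-data log-likelihood, not the data log-likelihood.
            # Since the E-step already computed, per sample,
            # gamma_sum = sum_k pi_k * N(x | mu_k, cov_k), the actual data
            # log-likelihood is available for free:
            '''
            loglikelihood = np.sum(np.log(gamma_sum))
            '''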
            i += 1

        # End of the homework fill-in section
    
    def fit(self, data):

        '''
        Initialize the model parameters and run EM.
        :param data: training data, shape (num_samples, dim)
        Attributes initialized here:
          weights: initial weight of each Gaussian, i.e. the prior P(zk=1)
                   (the probability that an unseen point belongs to component k)
          means:   mean vectors of the Gaussians
          covars:  covariance matrices of the Gaussians
        '''
        self.data = data
        self.weights = np.random.rand(self.n_clusters)
        self.weights /= np.sum(self.weights)
        dim = np.shape(self.data)[1]
        self.means = []
        # Generate n_clusters initial means
        for i in range(self.n_clusters):
            mean = np.random.rand(dim)
            self.means.append(mean)
        self.covars = []
        # Generate n_clusters initial covariances
        for i in range(self.n_clusters):
            cov = np.eye(dim)
            self.covars.append(cov)
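        # Note (an assumption on my part, not from the original article):
        # np.random.rand(dim) draws means from [0, 1)^dim, which can sit far
        # from the data and slow convergence. A common alternative is to sample
        # n_clusters distinct data points as the initial means:
        '''
        idx = np.random.choice(len(self.data), self.n_clusters, replace=False)
        self.means = [self.data[i] for i in idx]
        '''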
        self.GMM_EM()



    def Gaussian(self, x, mean, cov):
        '''
        Hand-written Gaussian probability density function for a single point.
        :param x: one data point
        :param mean: mean of the Gaussian
        :param cov: covariance matrix of the Gaussian
        :return: density of the Gaussian at x
        '''
        dim = np.shape(cov)[0]
        covdet = np.linalg.det(cov)
        covinv = np.linalg.inv(cov)
        xdiff = (x - mean).reshape((1, dim))
        # Probability density
        prob = 1.0 / (np.power(2 * np.pi, dim / 2) * np.power(covdet, 1 / 2)) * \
               np.exp(-1 / 2 * xdiff.dot(covinv).dot(xdiff.T))[0][0]
        return prob    
        
    # Equivalent to: prob = multivariate_normal.pdf(x, mean, cov)
    # (from scipy.stats import multivariate_normal)
    def Gaussian2(self,x,mean,cov):
        '''
        Hand-written Gaussian density for a batch of points; implements the same
        functionality as prob = multivariate_normal.pdf(x, mean, cov).
        :param x: matrix of data points, shape (num_samples, dim)
        :param mean: mean of the Gaussian
        :param cov: covariance matrix of the Gaussian
        :return: density of the Gaussian at each point in x
        '''
        dim = np.shape(cov)[0]
        covdet = np.linalg.det(cov)
        covinv = np.linalg.inv(cov)
        probs = []
        for i in range(x.shape[0]):
            xdiff = (x[i] - mean).reshape((1,dim))
            # Probability density
            prob = 1.0/(np.power(2*np.pi,dim/2)*np.power(covdet,1/2))*\
                np.exp(-1/2*xdiff.dot(covinv).dot(xdiff.T))[0][0]
            probs.append(prob)
        probs = np.array(probs)
        return probs
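    # A quick sanity check (a sketch; gmm is assumed to be a fitted instance):
    # >>> pts = np.random.rand(5, 2)
    # >>> np.allclose(gmm.Gaussian2(pts, np.zeros(2), np.eye(2)),
    # ...             multivariate_normal.pdf(pts, np.zeros(2), np.eye(2)))
    # True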
    def predict(self, data):
        # Begin homework fill-in section
        num = data.shape[0]
        gammas = np.zeros((num, self.n_clusters))
        for k in range(self.n_clusters):
            # Evaluate the densities on the data passed in
            # (self.data would ignore the argument)
            prob = multivariate_normal.pdf(data, self.means[k], self.covars[k])
            gammas[:, k] = prob * self.weights[k]
        # Normalize so each row of responsibilities sums to 1
        gamma_sum = np.sum(gammas, axis=1)
        for k in range(self.n_clusters):
            gammas[:, k] = gammas[:, k] / gamma_sum
        self.posterior = gammas
        self.prediction = np.argmax(gammas, axis=1)
        return self.prediction
        # End homework fill-in section

# Generate synthetic data
def generate_X(true_Mu, true_Var):
    # Data for the first cluster
    num1, mu1, var1 = 400, true_Mu[0], true_Var[0]
    X1 = np.random.multivariate_normal(mu1, np.diag(var1), num1)
    # Data for the second cluster
    num2, mu2, var2 = 600, true_Mu[1], true_Var[1]
    X2 = np.random.multivariate_normal(mu2, np.diag(var2), num2)
    # Data for the third cluster
    num3, mu3, var3 = 1000, true_Mu[2], true_Var[2]
    X3 = np.random.multivariate_normal(mu3, np.diag(var3), num3)
    # Stack all clusters together
    X = np.vstack((X1, X2, X3))
    # Optionally visualize the raw data
    '''    
    plt.figure(figsize=(10, 8))
    plt.axis([-10, 15, -5, 15])
    plt.scatter(X1[:, 0], X1[:, 1], s=5)
    plt.scatter(X2[:, 0], X2[:, 1], s=5)
    plt.scatter(X3[:, 0], X3[:, 1], s=5)
    plt.show()
    '''
    return X

if __name__ == '__main__':
    # Generate the data
    true_Mu = [[0.5, 0.5], [5.5, 2.5], [1, 7]]
    true_Var = [[1, 3], [2, 2], [6, 2]]
    X = generate_X(true_Mu, true_Var)

    gmm = GMM(n_clusters=3)
    gmm.fit(X)
    cat = gmm.predict(X)
    print("gmm.means:",gmm.means)
    print("gmm.var",gmm.covars)
    print("cat:",cat)
    plt.figure(figsize=(10, 8))
    plt.axis([-10, 15, -5, 15])
    plt.scatter(X[cat == 0, 0], X[cat == 0, 1], c='r', s=5)
    plt.scatter(X[cat == 1, 0], X[cat == 1, 1], c='b', s=5)
    plt.scatter(X[cat == 2, 0], X[cat == 2, 1], c='y', s=5)
    plt.show()



[Figure: clustering result on the 2,000 generated points, colored red/blue/yellow by predicted component]
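As a sanity check on the fitted parameters, the result can be compared against scikit-learn's reference implementation (a sketch, assuming scikit-learn is installed and run on the same X):

from sklearn.mixture import GaussianMixture

# Fit a 3-component GMM with full covariance matrices as a reference
ref = GaussianMixture(n_components=3, covariance_type='full').fit(X)
print("sklearn means:", ref.means_)
print("sklearn covariances:", ref.covariances_)
print("sklearn weights:", ref.weights_)

Component order is arbitrary in both implementations, so match components by their means before comparing parameters.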
