高斯混合聚类算法及python实现

1 高斯混合聚类算法


输入:
样本数据 $D=\{cor_1, cor_2, ..., cor_m\}$,其中 $cor_i=(x_i, y_i)$
聚类簇数:k
迭代次数:steps
计算过程:

1.初始化高斯混合分布模型参数:$\{(\alpha_i, \mu_i, \Sigma_i) \mid 1 \leq i \leq k\}$
2.迭代steps步:
3.for j=1,2, …,m do
4.计算各数据的后验概率: $\gamma_{ji} = \dfrac{\alpha_i\, p(x_j \mid \mu_i, \Sigma_i)}{\sum_{l=1}^{k} \alpha_l\, p(x_j \mid \mu_l, \Sigma_l)}$
其中,高斯概率密度 $p(x \mid \mu, \Sigma) = \dfrac{1}{(2\pi)^{\frac{n}{2}} |\Sigma|^{\frac{1}{2}}}\, e^{-\frac{1}{2}(x-\mu)\Sigma^{-1}(x-\mu)^T}$
5.endfor
6.for i=1,2,…,k do
7.更新数据:
8.均值向量: $\mu_i' = \dfrac{\sum_{j=1}^{m}\gamma_{ji}\, x_j}{\sum_{j=1}^{m}\gamma_{ji}}$
9.协方差矩阵: $\Sigma_i' = \dfrac{\sum_{j=1}^{m}\gamma_{ji}\,(x_j-\mu_i')^T(x_j-\mu_i')}{\sum_{j=1}^{m}\gamma_{ji}}$
10.混合系数: $\alpha_i' = \dfrac{\sum_{j=1}^{m}\gamma_{ji}}{m}$
11.endfor
12.更新模型参数:将 $\{(\alpha_i, \mu_i, \Sigma_i) \mid 1 \leq i \leq k\}$ 更新为 $\{(\alpha_i', \mu_i', \Sigma_i') \mid 1 \leq i \leq k\}$
13.for j=1, 2, …, m do
14.数据标记:挑选最大后验概率 $\max_i \gamma_{ji}$ 对应的簇
15.划分到相应的簇
16.endfor
输出:簇划分 $cluster_1, cluster_2, ..., cluster_k$


2 python实现

2.1 Demo

import matplotlib.pyplot as plt
import numpy as np
import math

# 原始数据
x = [0.697, 0.774, 0.634, 0.608, 0.556, 0.403, 0.481, 0.437, 0.666, 0.243,
     0.245, 0.343, 0.639, 0.657, 0.360, 0.593, 0.719, 0.359, 0.339, 0.282,
     0.748, 0.714, 0.483, 0.478, 0.525, 0.751, 0.532, 0.473, 0.725, 0.446]

y = [0.460, 0.376, 0.264, 0.318, 0.215, 0.237, 0.149, 0.211, 0.091, 0.267,
     0.057, 0.099, 0.161, 0.198, 0.370, 0.042, 0.103, 0.188, 0.241, 0.257,
     0.232, 0.346, 0.312, 0.437, 0.369, 0.489, 0.472, 0.376, 0.445, 0.459]
     
# 矩阵测试
def test_matrix():
    sigma = np.mat([[0.2, 0.1], [0.0, 0.1]])
    sigma_Trans = sigma.T
    sigma_inverse = sigma.I
    print("sigma: {}".format(sigma))
    print("sigma Inverse: {}".format(sigma_inverse))
    print("sigma Transform: {}".format(sigma_Trans))

# 
def gauss_density_probability(n, x, mu, sigma):
	```
	计算高斯概率密度。
	参数:
		n:数据维度
		x:原始数据
		mu:均值向量
		sigma:协方差矩阵
	返回:
		p:高斯概率
	```
    sigma_det = np.linalg.det(sigma)
    divisor = pow(2*np.pi, n/2)*np.sqrt(sigma_det)
    exp = np.exp(-0.5*(x-mu)*sigma.I*(x-mu).T)
    p = exp/divisor
    return p
# 后验概率测试
def test_posterior_probability():
    xx = np.mat([[x[0], y[0]]])
    mu_datasets = [np.mat([[x[5], y[5]]]), np.mat([[x[21], y[21]]]), np.mat([[x[26], y[26]]])]
    sigma = np.mat([[0.1, 0.0], [0.0, 0.1]])
    det = np.linalg.det(sigma)
    print("det: {}".format(det))
    p_all = []
    for mu in mu_datasets:
        p = gauss_density_probability(2, xx, mu, sigma)
        p = p/3
        p_all.append(p)
    p_mean = []
    for p in p_all:
        p_sum = np.sum(p_all)
        p = p/p_sum
        p_mean.append(p)
    print("probability: {}".format(p_mean[0]))

def posterior_probability(k, steps):
	```
	高斯混合聚类+EM算法实现聚类。
	参数:
		k:簇类数
		steps:迭代次数
	返回:
		p_all:后验概率
		mu_datasets:均值矩阵
		sigma_datasets:协方差矩阵
		alpha_datasets:混合系数
		classification_cluster:簇分类
	```
    alpha_datasets = [np.mat([1/k]) for _ in range(k)]
    xx = [np.mat([[x[i], y[i]]]) for i in range(len(x))]
    mu_rand = np.random.randint(0, 30, (1, k))
    print("random: {}".format(mu_rand[0]))
#     mu_datasets = [np.mat([[x[i], y[i]]]) for i in mu_rand[0]]
    mu_datasets = [np.mat([[x[5], y[5]]]), np.mat([[x[21], y[21]]]), np.mat([[x[26], y[26]]])]
    sigma_datasets = [np.mat([[0.1, 0.0], [0.0, 0.1]]) for _ in range(k)]
#     det = np.linalg.det(sigma_datasets[0])
    for step in range(steps):
        p_all = []
        # create cluster
        classification_temp = locals()
        for i in range(k):
            classification_temp['cluster_'+str(i)] = []
        # 后验概率分组  
        for j in range(len(x)):
            p_group = []
            for i in range(k):
                mu = mu_datasets[i]
                p = gauss_density_probability(2, xx[j], mu, sigma_datasets[i])

                p = p*alpha_datasets[i].getA()[0][0]
                p_group.append(p)
            p_sum = np.sum(p_group)
            # 取最大后验概率
            max_p = max(p_group)
            max_index = p_group.index(max_p)
            # 最大后验概率聚类
            for i in range(k):
                if i == max_index:
                    classification_temp['cluster_'+str(max_index)].append(j)
            
            p_group = [p_group[i]/p_sum for i in range(len(p_group))]
            p_all.append(p_group)

            

        # 更新 mu, sigma, alpha
        mu_datasets = []
        sigma_datasets = []
        alpha_datasets = []

        for i in range(k):
            mu_temp_numerator = 0
            mu_temp_denominator = 0
            sigma_temp = 0
            alpha_temp = 0
            mu_numerator = [p_all[j][i]*xx[j] for j in range(len(x))]
            for mm in mu_numerator:
                mu_temp_numerator += mm

            mu_denominator = [p_all[j][i] for j in range(len(x))]
            for nn in mu_denominator:
                mu_temp_denominator += nn

            mu_dataset = mu_temp_numerator/mu_temp_denominator
            mu_datasets.append(mu_dataset)

            sigma = [p_all[j][i].getA()[0][0]*(xx[j]-mu_dataset).T*(xx[j]-mu_dataset) for j in range(len(x))]
            for ss in sigma:
                sigma_temp += ss
            sigma_dataset = sigma_temp/mu_temp_denominator
            sigma_datasets.append(sigma_dataset)

            alpha_new = [p_all[j][i] for j in range(len(x))]
            for alpha_nn in alpha_new:
                alpha_temp += alpha_nn
            alpha_dataset = alpha_temp/len(x)
            alpha_datasets.append(alpha_dataset)
    return p_all, mu_datasets, sigma_datasets, alpha_datasets, classification_temp
    
def cluster_visiualization(k, steps):
	```
	可视化聚类结果。
	参数:
		k:簇类数
		steps:迭代次数
	返回:
		无
	```
    post_probability, mu_datasets, sigma_datasets, alpha_datasets, classification_temp = posterior_probability(k, steps)
    plt.figure(figsize=(8, 8))
    markers = ['.', 's', '^', '<', '>', 'P']
    plt.xlim(0.1, 0.9)
    plt.ylim(0, 0.9)
    plt.grid()
    plt.scatter(x, y, color='r')
    
    plt.figure(figsize=(8, 8))
    for i in range(k):
    	# 依据聚类获取对应数据,并显示
        xx = [x[num] for num in classification_temp['cluster_'+str(i)]]
        yy = [y[num] for num in classification_temp['cluster_'+str(i)]]
        
        plt.xlim(0.1, 0.9)
        plt.ylim(0, 0.9)
        plt.grid()
        plt.scatter(xx, yy, marker=markers[i])
    plt.savefig("./images/gauss_cluster.png", format="png")
   
if __name__ == "__main__":
    cluster_visiualization(3, 100)  

2.2 Result

在这里插入图片描述

图2.1 原始数据

在这里插入图片描述

图2.2 聚类后的数据展示
  • 7
    点赞
  • 80
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 13
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 13
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

天然玩家

坚持才能做到极致

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值