聚类的相关算法(clustering algorithms)笔记

GMM 原理讲解:https://www.cnblogs.com/demo-deng/p/7127979.html

代码实现参考:https://blog.csdn.net/SofaSofa_io/article/details/89708552

 

# 文件功能:实现 GMM 算法

import numpy as np
from numpy import *
import pylab
import random,math

import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
from scipy.stats import multivariate_normal
plt.style.use('seaborn')

class GMM(object):
    """Gaussian mixture model with diagonal (per-axis) covariances, fit by EM.

    Each component is an axis-aligned Gaussian; E/M updates operate on
    NumPy arrays of shape (n_points, n_features).
    """

    def __init__(self, n_clusters, max_iter=500):
        # Number of mixture components.
        self.n_clusters = n_clusters
        # Upper bound on EM iterations (the original fit() ignored this and
        # always ran exactly 5 iterations).
        self.max_iter = max_iter
        # Fitted parameters, populated by fit().
        self.Mu = None
        self.Var = None
        self.Pi = None

    # E-step: posterior responsibility W[i, k] of component k for point i.
    def update_W(self, X, Mu, Var, Pi):
        n_points, n_clusters = len(X), len(Pi)
        pdfs = np.zeros((n_points, n_clusters))
        for k in range(n_clusters):
            pdfs[:, k] = Pi[k] * multivariate_normal.pdf(X, Mu[k], np.diag(Var[k]))
        # Guard against all-zero rows caused by floating-point underflow.
        totals = np.maximum(pdfs.sum(axis=1, keepdims=True), 1e-300)
        return pdfs / totals

    # M-step: mixing weights are the mean responsibility of each component.
    def update_Pi(self, W):
        return W.sum(axis=0) / W.sum()

    # Plot the points plus dotted ellipses for the estimated components and
    # translucent ellipses for the true components (if given).
    def plot_clusters(self, X, Mu, Var, Mu_true=None, Var_true=None):
        colors = ['b', 'g', 'r']
        n_clusters = len(Mu)
        plt.figure(figsize=(10, 8))
        plt.axis([-10, 15, -5, 15])
        plt.scatter(X[:, 0], X[:, 1], s=5)
        ax = plt.gca()
        for k in range(n_clusters):
            # Cycle colors so more than three clusters no longer crashes.
            color = colors[k % len(colors)]
            ellipse = Ellipse(Mu[k], 3 * Var[k][0], 3 * Var[k][1],
                              fc='None', lw=2, edgecolor=color, ls=':')
            ax.add_patch(ellipse)
        # The original used bitwise '&' between the None-checks; 'and' is the
        # correct boolean connective here.
        if (Mu_true is not None) and (Var_true is not None):
            for k in range(n_clusters):
                color = colors[k % len(colors)]
                ellipse = Ellipse(Mu_true[k], 3 * Var_true[k][0], 3 * Var_true[k][1],
                                  fc='None', lw=2, edgecolor=color, alpha=0.5)
                ax.add_patch(ellipse)
        plt.show()

    # M-step: means are responsibility-weighted averages of the points.
    def update_Mu(self, X, W):
        n_clusters = W.shape[1]
        Mu = np.zeros((n_clusters, X.shape[1]))  # was hard-coded to 2 dims
        for k in range(n_clusters):
            Mu[k] = np.average(X, axis=0, weights=W[:, k])
        return Mu

    # M-step: per-axis variances, responsibility-weighted.
    def update_Var(self, X, Mu, W):
        n_clusters = W.shape[1]
        Var = np.zeros((n_clusters, X.shape[1]))  # was hard-coded to 2 dims
        for k in range(n_clusters):
            Var[k] = np.average((X - Mu[k]) ** 2, axis=0, weights=W[:, k])
        return Var

    # Mean per-point log-likelihood of the data under the current mixture.
    def logLH(self, X, Pi, Mu, Var):
        n_points, n_clusters = len(X), len(Pi)
        pdfs = np.zeros((n_points, n_clusters))
        for k in range(n_clusters):
            pdfs[:, k] = Pi[k] * multivariate_normal.pdf(X, Mu[k], np.diag(Var[k]))
        return np.mean(np.log(np.maximum(pdfs.sum(axis=1), 1e-300)))

    def fit(self, data):
        """Run EM on `data` ((n_points, n_features) array-like); return W.

        Fixes over the original:
        - initial means were all [0, 0], a symmetric fixed point EM can
          never escape; they are now seeded with spread-out data points;
        - runs up to self.max_iter iterations (was hard-coded to 5) with an
          early stop once the log-likelihood converges;
        - initialization is sized by self.n_clusters (was hard-coded to 3);
        - fitted parameters are stored on self so predict() can reuse them.
        """
        data = np.asarray(data, dtype=float)
        n_points = len(data)
        k = self.n_clusters
        # Deterministic, symmetry-breaking initialization from the data.
        seed_idx = np.linspace(0, n_points - 1, k).astype(int)
        Mu = data[seed_idx].copy()
        Var = np.ones((k, data.shape[1]))
        Pi = np.ones(k) / k
        W = np.ones((n_points, k)) / k
        loglh = []
        for _ in range(self.max_iter):
            loglh.append(self.logLH(data, Pi, Mu, Var))
            W = self.update_W(data, Mu, Var, Pi)
            Pi = self.update_Pi(W)
            Mu = self.update_Mu(data, W)
            Var = self.update_Var(data, Mu, W)
            # Converged: log-likelihood no longer improving.
            if len(loglh) > 1 and abs(loglh[-1] - loglh[-2]) < 1e-6:
                break
        self.Mu, self.Var, self.Pi = Mu, Var, Pi
        return W

    def predict(self, data):
        """Hard-assign each point to its most probable component.

        The original refit the model from scratch on every predict() call;
        the parameters stored by fit() are now reused (falling back to a
        fresh fit only if the model has not been fitted yet).
        """
        data = np.asarray(data, dtype=float)
        if self.Mu is None:
            W = self.fit(data)
        else:
            W = self.update_W(data, self.Mu, self.Var, self.Pi)
        return W.argmax(axis=1)

# 生成仿真数据
def generate_X(true_Mu, true_Var, sizes=(400, 600, 1000), show=True):
    """Sample 2-D points from Gaussian clusters with diagonal covariance.

    Args:
        true_Mu: list of cluster means (one per cluster).
        true_Var: list of per-axis variances (diagonal covariance entries).
        sizes: samples per cluster; default (400, 600, 1000) reproduces the
            original hard-coded cluster sizes.
        show: scatter-plot the clusters; default True preserves the original
            behavior.

    Returns:
        Array of shape (sum(sizes), 2) with samples stacked cluster by
        cluster (cluster 0 first).
    """
    clusters = [
        np.random.multivariate_normal(mu, np.diag(var), n)
        for n, mu, var in zip(sizes, true_Mu, true_Var)
    ]
    X = np.vstack(clusters)
    if show:
        plt.figure(figsize=(10, 8))
        plt.axis([-10, 15, -5, 15])
        for cluster in clusters:
            plt.scatter(cluster[:, 0], cluster[:, 1], s=5)
        plt.show()
    return X

if __name__ == '__main__':
    # Ground-truth parameters of the synthetic three-component mixture.
    true_Mu = [[0.5, 0.5], [5.5, 2.5], [1, 7]]
    true_Var = [[1, 3], [2, 2], [6, 2]]
    samples = generate_X(true_Mu, true_Var)

    # Fit a three-component GMM and print the hard cluster assignments.
    model = GMM(n_clusters=3)
    model.fit(samples)
    assignments = model.predict(samples)
    print(assignments)
    

# 文件功能:实现 GMM 算法

import numpy as np
from numpy import *
import pylab
import random,math

import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
from scipy.stats import multivariate_normal
plt.style.use('seaborn')

class GMM(object):
    """Gaussian mixture model with full covariance matrices, fit by plain EM."""

    def __init__(self, n_clusters, max_iter=50):
        # Number of mixture components.
        self.n_clusters = n_clusters
        # Fixed number of EM iterations to run.
        self.ITER = max_iter
        # Fitted parameters, populated by fit().
        self.mu = 0
        self.sigma = 0
        self.alpha = 0

    def multiGaussian(self, x, mu, sigma):
        """Density of a d-dimensional Gaussian with mean mu / covariance sigma.

        The original normalization constant 1/(2*pi*sqrt(det)) was only valid
        for d == 2; the general (2*pi)^(d/2) factor is used here, which is
        identical for 2-D data.
        """
        d = np.size(mu)
        diff = np.asarray(x) - np.asarray(mu)
        norm_const = (2 * np.pi) ** (d / 2) * pow(np.linalg.det(sigma), 0.5)
        # pinv instead of inv tolerates near-singular covariance estimates.
        return np.exp(-0.5 * diff.dot(np.linalg.pinv(sigma)).dot(diff.T)) / norm_const

    def computeGamma(self, X, mu, sigma, alpha, multiGaussian):
        """E-step: responsibilities gamma[i, j] of component j for sample i."""
        n_samples = X.shape[0]
        n_clusters = len(alpha)
        gamma = np.zeros((n_samples, n_clusters))
        for i in range(n_samples):
            weighted = np.array([alpha[j] * multiGaussian(X[i], mu[j], sigma[j])
                                 for j in range(n_clusters)])
            gamma[i] = weighted / weighted.sum()
        return gamma

    def fit(self, data):
        """Run self.ITER EM iterations, storing mu/sigma/alpha on self.

        Fixes over the original: the initial means are drawn from the data
        and sized by self.n_clusters (they were three hard-coded 2-D
        constants in [0, 1], which broke for any other cluster count or data
        scale), and the inner M-step loop no longer reuses the outer loop's
        index variable.
        """
        data = np.asarray(data, dtype=float)
        n_samples, n_features = data.shape
        k = self.n_clusters

        alpha = np.ones(k) / k
        # Deterministic, symmetry-breaking init: spread-out sample points.
        seed_idx = np.linspace(0, n_samples - 1, k).astype(int)
        mu = data[seed_idx].copy()
        sigma = np.full((k, n_features, n_features),
                        np.diag(np.full(n_features, 0.1)))

        for _ in range(self.ITER):
            gamma = self.computeGamma(data, mu, sigma, alpha, self.multiGaussian)
            Nk = np.sum(gamma, axis=0)  # effective sample count per component
            alpha = Nk / n_samples
            for j in range(k):
                w = gamma[:, j].reshape((n_samples, 1))
                mu[j] = np.sum(data * w, axis=0) / Nk[j]
                diff = data - mu[j]
                # Weighted outer-product sum, equivalent to the original
                # per-sample accumulation loop.
                sigma[j] = (diff * w).T.dot(diff) / Nk[j]
        self.mu = mu
        self.sigma = sigma
        self.alpha = alpha

    def predict(self, data):
        """Hard cluster assignment: argmax responsibility per sample."""
        gamma = self.computeGamma(np.asarray(data, dtype=float),
                                  self.mu, self.sigma, self.alpha,
                                  self.multiGaussian)
        return np.argmax(gamma, axis=1)

# 生成仿真数据
def generate_X(true_Mu, true_Var, sizes=(400, 600, 1000), show=True):
    """Draw synthetic 2-D data from Gaussian clusters (diagonal covariance).

    Args:
        true_Mu: per-cluster mean vectors.
        true_Var: per-cluster, per-axis variances (diagonal covariance).
        sizes: number of samples for each cluster; the default reproduces
            the original hard-coded 400/600/1000 split.
        show: if True (original behavior), scatter-plot the clusters.

    Returns:
        (sum(sizes), 2) array; samples of cluster 0 come first, then
        cluster 1, and so on.
    """
    per_cluster = [
        np.random.multivariate_normal(mean, np.diag(var), count)
        for count, mean, var in zip(sizes, true_Mu, true_Var)
    ]
    X = np.vstack(per_cluster)
    if show:
        plt.figure(figsize=(10, 8))
        plt.axis([-10, 15, -5, 15])
        for pts in per_cluster:
            plt.scatter(pts[:, 0], pts[:, 1], s=5)
        plt.show()
    return X

if __name__ == '__main__':
    # Synthesize three Gaussian clusters, then recover them with the GMM.
    centers = [[0.5, 0.5], [5.5, 2.5], [1, 7]]
    spreads = [[1, 3], [2, 2], [6, 2]]
    points = generate_X(centers, spreads)

    estimator = GMM(n_clusters=3)
    estimator.fit(points)
    labels = estimator.predict(points)
    print(labels)
    

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值