# Reference: https://www.cnblogs.com/demo-deng/p/7127979.html (GMM)
# Code implementation: https://blog.csdn.net/SofaSofa_io/article/details/89708552
# File purpose: implement the GMM (Gaussian Mixture Model) algorithm
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
from scipy.stats import multivariate_normal

# Note: on Matplotlib >= 3.6 this style was renamed to 'seaborn-v0_8'.
plt.style.use('seaborn')
class GMM(object):
    def __init__(self, n_clusters, max_iter=500):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
    # Masked section begins
    # E-step: update the responsibilities W
    def update_W(self, X, Mu, Var, Pi):
        n_points, n_clusters = len(X), len(Pi)
        pdfs = np.zeros((n_points, n_clusters))
        for i in range(n_clusters):
            pdfs[:, i] = Pi[i] * multivariate_normal.pdf(X, Mu[i], np.diag(Var[i]))
        W = pdfs / pdfs.sum(axis=1).reshape(-1, 1)
        return W
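    # For reference, update_W computes the posterior probability that point n
    # was generated by component k:
    #   W[n, k] = Pi[k] * N(x_n | Mu[k], diag(Var[k])) / sum_j Pi[j] * N(x_n | Mu[j], diag(Var[j]))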
    # M-step: update the mixing weights Pi
    def update_Pi(self, W):
        Pi = W.sum(axis=0) / W.sum()
        return Pi
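    # For reference: Pi[k] = N_k / N, where N_k = sum_n W[n, k] is the
    # effective number of points assigned to component k.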
    # Plot the clustering result
    def plot_clusters(self, X, Mu, Var, Mu_true=None, Var_true=None):
        colors = ['b', 'g', 'r']
        n_clusters = len(Mu)
        plt.figure(figsize=(10, 8))
        plt.axis([-10, 15, -5, 15])
        plt.scatter(X[:, 0], X[:, 1], s=5)
        ax = plt.gca()
        for i in range(n_clusters):
            plot_args = {'fc': 'None', 'lw': 2, 'edgecolor': colors[i], 'ls': ':'}
            # Axis lengths are 3 * variance here; 3 * sqrt(Var) would give
            # true 3-sigma ellipses.
            ellipse = Ellipse(Mu[i], 3 * Var[i][0], 3 * Var[i][1], **plot_args)
            ax.add_patch(ellipse)
        if (Mu_true is not None) and (Var_true is not None):
            for i in range(n_clusters):
                plot_args = {'fc': 'None', 'lw': 2, 'edgecolor': colors[i], 'alpha': 0.5}
                ellipse = Ellipse(Mu_true[i], 3 * Var_true[i][0], 3 * Var_true[i][1], **plot_args)
                ax.add_patch(ellipse)
        plt.show()
    # M-step: update the means Mu
    def update_Mu(self, X, W):
        n_clusters = W.shape[1]
        Mu = np.zeros((n_clusters, 2))
        for i in range(n_clusters):
            Mu[i] = np.average(X, axis=0, weights=W[:, i])
        return Mu
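    # For reference: Mu[k] = sum_n W[n, k] * x_n / N_k, the responsibility-weighted mean.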
    # M-step: update the (diagonal) variances Var
    def update_Var(self, X, Mu, W):
        n_clusters = W.shape[1]
        Var = np.zeros((n_clusters, 2))
        for i in range(n_clusters):
            Var[i] = np.average((X - Mu[i]) ** 2, axis=0, weights=W[:, i])
        return Var
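    # For reference: Var[k] = sum_n W[n, k] * (x_n - Mu[k])**2 / N_k, computed
    # per coordinate, i.e. this model uses diagonal covariances.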
    # Average log-likelihood (the objective monitored during EM)
    def logLH(self, X, Pi, Mu, Var):
        n_points, n_clusters = len(X), len(Pi)
        pdfs = np.zeros((n_points, n_clusters))
        for i in range(n_clusters):
            pdfs[:, i] = Pi[i] * multivariate_normal.pdf(X, Mu[i], np.diag(Var[i]))
        return np.mean(np.log(pdfs.sum(axis=1)))
    # Masked section ends
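    # For reference: logLH returns (1/N) * sum_n log(sum_k Pi[k] * N(x_n | Mu[k], diag(Var[k]))).
    # EM increases this quantity monotonically, which makes it a useful convergence check.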
    # Run the E and M steps and return the posterior probabilities
    def fit(self, data):
        # Assignment 3
        # Masked section begins
        n_points = len(data)
        # Initialize the parameters. Distinct starting means are required:
        # identical means for every component would leave EM stuck in a
        # symmetric state, so sample them from the data.
        Mu = data[np.random.choice(n_points, self.n_clusters, replace=False)]
        Var = np.ones((self.n_clusters, 2))
        W = np.ones((n_points, self.n_clusters)) / self.n_clusters
        Pi = W.sum(axis=0) / W.sum()
        loglh = []
        for i in range(self.max_iter):
            # self.plot_clusters(data, Mu, Var, true_Mu, true_Var)
            loglh.append(self.logLH(data, Pi, Mu, Var))
            print('log-likelihood: %.3f' % loglh[-1])
            # E-step: posterior probabilities; each row holds the probability
            # of the point belonging to each Gaussian component
            W = self.update_W(data, Mu, Var, Pi)
            # M-step: update the parameters
            Pi = self.update_Pi(W)
            Mu = self.update_Mu(data, W)
            Var = self.update_Var(data, Mu, W)
            # Stop early once the log-likelihood has converged.
            if i > 0 and abs(loglh[-1] - loglh[-2]) < 1e-6:
                break
        # Store the fitted parameters so predict() can reuse them.
        self.Mu, self.Var, self.Pi = Mu, Var, Pi
        return W
        # Masked section ends
    # Probability of each point under each component, using the parameters obtained above
    def predict(self, data):
        # Masked section begins
        # Use the parameters stored by fit() instead of refitting from scratch.
        W = self.update_W(data, self.Mu, self.Var, self.Pi)
        # For each row of W, take the index of the largest posterior probability.
        Label = W.argmax(axis=1)
        # Masked section ends
        return Label
# Generate synthetic data
def generate_X(true_Mu, true_Var):
    # First cluster
    num1, mu1, var1 = 400, true_Mu[0], true_Var[0]
    X1 = np.random.multivariate_normal(mu1, np.diag(var1), num1)
    # Second cluster
    num2, mu2, var2 = 600, true_Mu[1], true_Var[1]
    X2 = np.random.multivariate_normal(mu2, np.diag(var2), num2)
    # Third cluster
    num3, mu3, var3 = 1000, true_Mu[2], true_Var[2]
    X3 = np.random.multivariate_normal(mu3, np.diag(var3), num3)
    # Stack the clusters together
    X = np.vstack((X1, X2, X3))
    # Show the data
    plt.figure(figsize=(10, 8))
    plt.axis([-10, 15, -5, 15])
    plt.scatter(X1[:, 0], X1[:, 1], s=5)
    plt.scatter(X2[:, 0], X2[:, 1], s=5)
    plt.scatter(X3[:, 0], X3[:, 1], s=5)
    plt.show()
    return X
if __name__ == '__main__':
    # Generate the data
    true_Mu = [[0.5, 0.5], [5.5, 2.5], [1, 7]]
    true_Var = [[1, 3], [2, 2], [6, 2]]
    X = generate_X(true_Mu, true_Var)
    gmm = GMM(n_clusters=3)
    gmm.fit(X)
    cat = gmm.predict(X)
    print(cat)
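    # Optional sanity check (a sketch, assuming scikit-learn is installed):
    # sklearn's GaussianMixture fits a GMM with full covariances and should
    # recover a similar partition of X (cluster ids may be permuted).
    from sklearn.mixture import GaussianMixture
    skl = GaussianMixture(n_components=3, random_state=0).fit(X)
    print(skl.predict(X))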
# ---------------------------------------------------------------------------
# Alternative implementation: GMM with full covariance matrices (from the
# CSDN link above). It redefines the GMM class with the same interface,
# shadowing the diagonal-covariance version from this point on, and reuses
# the imports and generate_X defined earlier.
# ---------------------------------------------------------------------------
class GMM(object):
    def __init__(self, n_clusters, max_iter=50):
        self.n_clusters = n_clusters
        self.ITER = max_iter
        self.mu = 0
        self.sigma = 0
        self.alpha = 0
    # Density of a multivariate normal, written out explicitly
    def multiGaussian(self, x, mu, sigma):
        return 1 / ((2 * np.pi) * pow(np.linalg.det(sigma), 0.5)) * np.exp(
            -0.5 * (x - mu).dot(np.linalg.pinv(sigma)).dot((x - mu).T))
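    # Note: the (2 * np.pi) normalizer hard-codes 2-D data; in d dimensions it
    # would be (2 * np.pi) ** (d / 2) * np.linalg.det(sigma) ** 0.5.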
    # E-step: responsibility gamma[i, k] of component k for sample i
    def computeGamma(self, X, mu, sigma, alpha, multiGaussian):
        n_samples = X.shape[0]
        n_clusters = len(alpha)
        gamma = np.zeros((n_samples, n_clusters))
        p = np.zeros(n_clusters)
        g = np.zeros(n_clusters)
        for i in range(n_samples):
            for j in range(n_clusters):
                p[j] = multiGaussian(X[i], mu[j], sigma[j])
                g[j] = alpha[j] * p[j]
            for k in range(n_clusters):
                gamma[i, k] = g[k] / np.sum(g)
        return gamma
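    # A vectorized alternative (a sketch using scipy.stats.multivariate_normal,
    # imported at the top of the file) that computes all responsibilities at
    # once instead of looping over samples:
    def computeGammaVectorized(self, X, mu, sigma, alpha):
        g = np.column_stack([alpha[j] * multivariate_normal.pdf(X, mu[j], sigma[j])
                             for j in range(len(alpha))])
        return g / g.sum(axis=1, keepdims=True)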
    def fit(self, data):
        n_samples = data.shape[0]
        n_features = data.shape[1]
        alpha = np.ones(self.n_clusters) / self.n_clusters
        # Initialize the means from randomly chosen data points and the
        # covariances from the per-feature variance of the data, so that every
        # sample has non-negligible density under at least one component.
        mu = data[np.random.choice(range(n_samples), self.n_clusters, replace=False)]
        sigma = np.full((self.n_clusters, n_features, n_features),
                        np.diag(np.var(data, axis=0)))
        for it in range(self.ITER):
            gamma = self.computeGamma(data, mu, sigma, alpha, self.multiGaussian)
            alpha = np.sum(gamma, axis=0) / n_samples
            for i in range(self.n_clusters):
                mu[i] = np.sum(data * gamma[:, i].reshape((n_samples, 1)), axis=0) / np.sum(gamma, axis=0)[i]
                sigma[i] = 0
                for j in range(n_samples):
                    sigma[i] += (data[j].reshape((1, n_features)) - mu[i]).T.dot(
                        (data[j] - mu[i]).reshape((1, n_features))) * gamma[j, i]
                sigma[i] = sigma[i] / np.sum(gamma, axis=0)[i]
        self.mu = mu
        self.sigma = sigma
        self.alpha = alpha
    # Hard assignment: argmax of the responsibilities under the fitted parameters
    def predict(self, data):
        pred = self.computeGamma(data, self.mu, self.sigma, self.alpha, self.multiGaussian)
        cluster_results = np.argmax(pred, axis=1)
        return cluster_results
# The synthetic-data generator generate_X defined above is reused here.
if __name__ == '__main__':
    # Generate the data and run the full-covariance version of GMM
    true_Mu = [[0.5, 0.5], [5.5, 2.5], [1, 7]]
    true_Var = [[1, 3], [2, 2], [6, 2]]
    X = generate_X(true_Mu, true_Var)
    gmm = GMM(n_clusters=3)
    gmm.fit(X)
    cat = gmm.predict(X)
    print(cat)
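    # Optional visualization (a sketch): color each point by its hard
    # assignment from the full-covariance model.
    plt.figure(figsize=(10, 8))
    plt.axis([-10, 15, -5, 15])
    plt.scatter(X[:, 0], X[:, 1], c=cat, s=5)
    plt.show()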