import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn import datasets
from collections import OrderedDict
class GMM():
    """Gaussian Mixture Model fitted with the EM algorithm.

    Fits K Gaussian components to the rows of X. Parameters default to a
    KMeans-based initialization; callers may instead supply their own
    initial mixture weights, means and covariances (the original code
    accepted these arguments but silently ignored them).

    Attributes:
        y_pred: (nSample,) hard cluster assignment (argmax responsibility).
    """

    def __init__(self, X, K, alpha=None, means=None, covars=None):
        """Fit the mixture immediately on construction.

        Args:
            X: (nSample, nDim) data matrix.
            K: number of mixture components.
            alpha: optional (K,) initial mixture weights (ndarray).
            means: optional (K, nDim) initial component means (ndarray).
            covars: optional mapping {k: (nDim, nDim) ndarray} of initial
                covariances, for k in range(K).
        """
        self.X = X
        self.K = K
        self.nSample, self.nDim = self.X.shape
        # Honor caller-supplied parameters when all three are given;
        # otherwise bootstrap from a KMeans clustering of X.
        if alpha is not None and means is not None and covars is not None:
            self.means, self.covs, self.alpha = means, covars, alpha
        else:
            self.means, self.covs, self.alpha = self.initial()
        self.gamma = np.zeros((self.nSample, self.K))      # responsibilities
        self.PDF = np.zeros((self.nSample, self.K))        # N(x_i | mu_k, Sigma_k)
        self.alpha_PDF = np.zeros((self.nSample, self.K))  # alpha_k * pdf
        self.sum_alpha_PDF = None                          # mixture density per sample
        self.y_pred = self.EM()

    def initial(self):
        """Initialize (means, covs, alpha) from a KMeans clustering of X."""
        y_hat = KMeans(n_clusters=self.K).fit_predict(self.X)
        labs, count = np.unique(y_hat, return_counts=True)
        means = np.zeros((self.K, self.nDim))
        for lab in range(self.K):
            means[lab] = np.mean(self.X[y_hat == lab], axis=0)
        covs = OrderedDict()
        for lab in range(self.K):
            diff = self.X[y_hat == lab] - means[lab]
            # ML covariance of the cluster. The original stored the raw
            # scatter matrix diff.T @ diff, over-scaled by the cluster size.
            covs[lab] = diff.T @ diff / count[lab]
        alpha = count / self.nSample
        return means, covs, alpha

    def gamma_update(self):
        """E-step: recompute responsibilities gamma[i, k] for all samples."""
        for k in range(self.K):
            diff = self.X - self.means[k]                  # (n, d)
            inv_cov = np.linalg.inv(self.covs[k])
            # Per-sample Mahalanobis distance via einsum. The original built
            # the full (n, n) matrix and took its diagonal — O(n^2) memory.
            maha = np.einsum('ij,jk,ik->i', diff, inv_cov, diff)
            norm = 1.0 / ((2 * np.pi) ** (self.nDim / 2)
                          * np.linalg.det(self.covs[k]) ** 0.5)
            self.PDF[:, k] = norm * np.exp(-0.5 * maha)
        self.alpha_PDF = self.PDF * self.alpha
        self.sum_alpha_PDF = np.sum(self.alpha_PDF, axis=1)[:, None]
        self.gamma = self.alpha_PDF / self.sum_alpha_PDF

    def EM(self):
        """Run EM until the log-likelihood change drops below tolerance.

        Returns:
            (nSample,) hard component assignment per sample.
        """
        like = 0.0
        old_like = 1.0
        while np.abs(like - old_like) > 1e-10:
            old_like = like
            # E-step
            self.gamma_update()
            # M-step
            sum_gamma = np.sum(self.gamma, axis=0)         # (K,)
            for k in range(self.K):
                # Responsibility-weighted mean over samples. The original
                # used np.mean(...) with no axis, collapsing to a scalar.
                self.means[k] = np.sum(
                    self.X * self.gamma[:, k][:, None], axis=0) / sum_gamma[k]
                cov_tmp = np.zeros((self.nDim, self.nDim))
                for i in range(self.nSample):
                    d = self.X[i] - self.means[k]
                    cov_tmp += self.gamma[i, k] * np.outer(d, d)
                self.covs[k] = cov_tmp / sum_gamma[k]
                self.alpha[k] = sum_gamma[k] / self.nSample
            # Log-likelihood of the data under the current mixture
            # (natural log; the original used log2).
            like = np.sum(np.log(self.sum_alpha_PDF))
        return np.argmax(self.gamma, axis=1)
if __name__ == '__main__':
    # Fit a 3-component GMM to the iris data and score the hard assignments.
    X, y = datasets.load_iris(return_X_y=True)
    model = GMM(X=X, K=3)
    print(model.y_pred)
    # NOTE: cluster ids are an arbitrary permutation of the true labels, so
    # accuracy_score is only meaningful when the permutation happens to align.
    print("ACC", accuracy_score(y_true=y, y_pred=model.y_pred))
    print(y)
    # Accuracy: 88.67%
    # (The line above was bare non-comment text in the original, which made
    # the whole file a SyntaxError.)