聚类
实验代码:
testGmm.m ——测试主函数
function testGmm()
X = generateData();
K = 2;
gamma = gmm(X, K);
%根据最大后验概率确定类别
[value,gamma_K] = max(gamma,[],2);
% show result
figure(3)
idx1 = find(gamma_K == 1);
plot(X(idx1,1),X(idx1,2),'ro', 'MarkerFaceColor','r');
hold on;
idx2 = find(gamma_K == 2);
plot(X(idx2,1),X(idx2,2),'bo','MarkerFaceColor','b');
title('K-Means Result');
end
generateData.m——产生模拟数据
function X = generateData()
%%
%产生混合高斯数据
%%
% show real data
figure(1)
mu = [2 3];
SIGMA = [1 0; 0 2];
r = mvnrnd(mu,SIGMA,100);
plot(r(:,1),r(:,2),'rx', 'MarkerFaceColor','r','LineWidth',2,'MarkerSize',10);
hold on;
mu = [7 8];
SIGMA = [ 1 0; 0 2];
r2 = mvnrnd(mu,SIGMA,100);
plot(r2(:,1),r2(:,2),'bx', 'MarkerFaceColor','b','LineWidth',2,'MarkerSize',10);
title('Real Data');
X = [r;r2];
figure(2)
% no label
plot(X(:,1),X(:,2),'mo', 'MarkerFaceColor','m');
title('No label data : X')
end
gmm.m——高斯混合模型的EM解法
function pGamma = gmm(X, K_or_centroids)
% ============================================================
% Expectation-Maximization iteration implementation of
% Gaussian Mixture Model.
%
% PX = GMM(X, K_OR_CENTROIDS)
% [PX MODEL] = GMM(X, K_OR_CENTROIDS)
%
% - X: N-by-D data matrix.
% - K_OR_CENTROIDS: either K indicating the number of
% components or a K-by-D matrix indicating the
% choosing of the initial K centroids.
%
% - PX: N-by-K matrix indicating the probability of each
% component generating each point.
% - MODEL: a structure containing the parameters for a GMM:
% MODEL.Miu: a K-by-D matrix.
% MODEL.Sigma: a D-by-D-by-K matrix.
% MODEL.Pi: a 1-by-K vector.
% ============================================================
threshold = 1e-15;
[N, D] = size(X);
%是否用k-means的聚类中心点作为初值
if isscalar(K_or_centroids)
K = K_or_centroids;
% randomly pick centroids
rndp = randperm(N);
centroids = X(rndp(1:K), :);
else
K = size(K_or_centroids, 1);
centroids = K_or_centroids;
end
% initial values
[pMiu pPi pSigma] = init_params();
Lprev = -inf;
while true
Px = calc_prob();
% E step : 求后验概率
% new value for pGamma
pGamma = Px .* repmat(pPi, N, 1);
pGamma = pGamma ./ repmat(sum(pGamma, 2), 1, K);
%求 pi, mu, Sigma
% new value for parameters of each Component
Nk = sum(pGamma, 1);
pMiu = diag(1./Nk) * pGamma' * X;
pPi = Nk/N;
for kk = 1:K
Xshift = X-repmat(pMiu(kk, :), N, 1);
pSigma(:, :, kk) = (Xshift' * ...
(diag(pGamma(:, kk)) * Xshift)) / Nk(kk);
end
%迭代终止条件
% check for convergence
L = sum(log(Px*pPi'));
if L-Lprev < threshold
break;
end
Lprev = L;
end
if nargout == 1
varargout = {Px};
else
model = [];
model.Miu = pMiu;
model.Sigma = pSigma;
model.Pi = pPi;
varargout = {Px, model};
end
%产生初始化参数
function [pMiu pPi pSigma] = init_params()
pMiu = centroids;
pPi = zeros(1, K);
pSigma = zeros(D, D, K);
% hard assign x to each centroids
distmat = repmat(sum(X.*X, 2), 1, K) + ...
repmat(sum(pMiu.*pMiu, 2)', N, 1) - ...
2*X*pMiu';
[dummy labels] = min(distmat, [], 2);
for k=1:K
Xk = X(labels == k, :);
pPi(k) = size(Xk, 1)/N;
pSigma(:, :, k) = cov(Xk);
end
end
%计算各数据点对应单独高斯分布中的概率值
function Px = calc_prob()
Px = zeros(N, K);
for k = 1:K
Xshift = X-repmat(pMiu(k, :), N, 1);
inv_pSigma = inv(pSigma(:, :, k));
tmp = sum((Xshift*inv_pSigma) .* Xshift, 2);
coef = (2*pi)^(-D/2) * sqrt(det(inv_pSigma));
Px(:, k) = coef * exp(-0.5*tmp);
end
end
end
gmm.m算法来自http://blog.pluskid.org/?p=39
实验结果:
(1)真实数据
(2)不含标签数据
(3)gmm结果
相关博客
(1)GMM的EM解法: EM算法(期望最大化)——应用:GMM http://blog.csdn.net/tingyue_/article/details/70576025
(2)K-Means实验:http://blog.csdn.net/tingyue_/article/details/70767601
(3)漫谈 Clustering (3): Gaussian Mixture Model:http://blog.pluskid.org/?p=39