1 高斯混合聚类算法
输入:
样本数据
D
=
c
o
r
1
,
c
o
r
2
,
.
.
.
,
c
o
r
m
,
c
o
r
i
=
(
x
i
,
y
i
)
D={cor_1, cor_2, ..., cor_m}, cor_i=(x_i, y_i)
D=cor1,cor2,...,corm,cori=(xi,yi),
聚类簇数:k
迭代次数:steps
计算过程:
1.初始化高斯混合分布模型参数:{(
α
i
,
μ
i
,
Σ
\alpha_i, \mu_i, \Sigma
αi,μi,Σ)|
1
≤
i
≥
k
1\leq i \geq k
1≤i≥k}
2.迭代steps步:
3.for j=1,2, …,m do
4.计算各数据的后验概率:
p
o
s
t
−
p
=
Σ
i
k
α
∗
p
(
x
∣
μ
i
,
Σ
i
)
post_-p = \Sigma_i^k \alpha* p(x\mid\mu_i, \Sigma_i)
post−p=Σikα∗p(x∣μi,Σi)
其中,
p
(
x
)
=
1
(
2
π
)
n
2
∣
Σ
∣
1
2
e
−
1
2
(
x
−
μ
)
Σ
−
1
(
x
−
μ
)
T
p(x)=\frac{1}{(2\pi)^{\frac{n}{2}}\mid\Sigma\mid^{\frac{1}{2}}}e^{-\frac{1}{2}(x-\mu)\Sigma^{-1}(x-\mu)^T}
p(x)=(2π)2n∣Σ∣211e−21(x−μ)Σ−1(x−μ)T,
p
o
s
t
−
p
=
γ
j
i
post_-p=\gamma_{ji}
post−p=γji
5.endfor
6.for i=1,2,…,k do
7.更新数据:
8.均值向量:
μ
i
′
=
Σ
j
=
1
m
γ
j
i
x
j
Σ
j
=
1
m
γ
j
i
\mu_i^{'}=\frac{\Sigma_{j=1}^{m}\gamma_{ji}x_j}{\Sigma_{j=1}^{m}\gamma_{ji}}
μi′=Σj=1mγjiΣj=1mγjixj
9.协方差矩阵:
Σ
i
′
=
Σ
j
=
1
m
γ
j
i
(
x
j
−
μ
j
′
)
T
(
x
j
−
μ
j
′
)
Σ
j
=
1
m
γ
j
i
\Sigma_i^{'}=\frac{\Sigma_{j=1}^m\gamma_{ji}(x_j-\mu_j^{'})^T(x_j-\mu_j^{'})}{\Sigma_{j=1}^m\gamma_{ji}}
Σi′=Σj=1mγjiΣj=1mγji(xj−μj′)T(xj−μj′)
10.混合系数:
α
i
′
=
Σ
j
=
1
m
γ
j
i
m
\alpha_i^{'}=\frac{\Sigma_{j=1}^m\gamma_{ji}}{m}
αi′=mΣj=1mγji
11.endfor
12.更新模型参数:{
(
α
i
,
μ
i
,
Σ
i
)
∣
1
≤
i
≤
k
{(\alpha_i, \mu_i, \Sigma_i)|1 \leq i \leq k}
(αi,μi,Σi)∣1≤i≤k}更新为{
(
α
i
′
,
μ
i
′
,
Σ
i
′
)
∣
1
≤
i
≤
k
{(\alpha_i^{'}, \mu_i^{'}, \Sigma_i^{'})|1 \leq i \leq k}
(αi′,μi′,Σi′)∣1≤i≤k}
13.for j=1, 2, …, m do
14.数据标记:挑选后验概率max
γ
j
i
\gamma_{ji}
γji
15.划分到相应的簇
16.endfor
输出:簇划分
c
l
u
s
t
e
r
1
,
c
l
u
s
t
e
r
2
,
.
.
.
,
c
l
u
s
t
e
r
k
cluster_1, cluster_2, ..., cluster_k
cluster1,cluster2,...,clusterk
2 python实现
2.1 Demo
import matplotlib.pyplot as plt
import numpy as np
import math
# 原始数据
x = [0.697, 0.774, 0.634, 0.608, 0.556, 0.403, 0.481, 0.437, 0.666, 0.243,
0.245, 0.343, 0.639, 0.657, 0.360, 0.593, 0.719, 0.359, 0.339, 0.282,
0.748, 0.714, 0.483, 0.478, 0.525, 0.751, 0.532, 0.473, 0.725, 0.446]
y = [0.460, 0.376, 0.264, 0.318, 0.215, 0.237, 0.149, 0.211, 0.091, 0.267,
0.057, 0.099, 0.161, 0.198, 0.370, 0.042, 0.103, 0.188, 0.241, 0.257,
0.232, 0.346, 0.312, 0.437, 0.369, 0.489, 0.472, 0.376, 0.445, 0.459]
# 矩阵测试
def test_matrix():
sigma = np.mat([[0.2, 0.1], [0.0, 0.1]])
sigma_Trans = sigma.T
sigma_inverse = sigma.I
print("sigma: {}".format(sigma))
print("sigma Inverse: {}".format(sigma_inverse))
print("sigma Transform: {}".format(sigma_Trans))
#
def gauss_density_probability(n, x, mu, sigma):
```
计算高斯概率密度。
参数:
n:数据维度
x:原始数据
mu:均值向量
sigma:协方差矩阵
返回:
p:高斯概率
```
sigma_det = np.linalg.det(sigma)
divisor = pow(2*np.pi, n/2)*np.sqrt(sigma_det)
exp = np.exp(-0.5*(x-mu)*sigma.I*(x-mu).T)
p = exp/divisor
return p
# 后验概率测试
def test_posterior_probability():
xx = np.mat([[x[0], y[0]]])
mu_datasets = [np.mat([[x[5], y[5]]]), np.mat([[x[21], y[21]]]), np.mat([[x[26], y[26]]])]
sigma = np.mat([[0.1, 0.0], [0.0, 0.1]])
det = np.linalg.det(sigma)
print("det: {}".format(det))
p_all = []
for mu in mu_datasets:
p = gauss_density_probability(2, xx, mu, sigma)
p = p/3
p_all.append(p)
p_mean = []
for p in p_all:
p_sum = np.sum(p_all)
p = p/p_sum
p_mean.append(p)
print("probability: {}".format(p_mean[0]))
def posterior_probability(k, steps):
```
高斯混合聚类+EM算法实现聚类。
参数:
k:簇类数
steps:迭代次数
返回:
p_all:后验概率
mu_datasets:均值矩阵
sigma_datasets:协方差矩阵
alpha_datasets:混合系数
classification_cluster:簇分类
```
alpha_datasets = [np.mat([1/k]) for _ in range(k)]
xx = [np.mat([[x[i], y[i]]]) for i in range(len(x))]
mu_rand = np.random.randint(0, 30, (1, k))
print("random: {}".format(mu_rand[0]))
# mu_datasets = [np.mat([[x[i], y[i]]]) for i in mu_rand[0]]
mu_datasets = [np.mat([[x[5], y[5]]]), np.mat([[x[21], y[21]]]), np.mat([[x[26], y[26]]])]
sigma_datasets = [np.mat([[0.1, 0.0], [0.0, 0.1]]) for _ in range(k)]
# det = np.linalg.det(sigma_datasets[0])
for step in range(steps):
p_all = []
# create cluster
classification_temp = locals()
for i in range(k):
classification_temp['cluster_'+str(i)] = []
# 后验概率分组
for j in range(len(x)):
p_group = []
for i in range(k):
mu = mu_datasets[i]
p = gauss_density_probability(2, xx[j], mu, sigma_datasets[i])
p = p*alpha_datasets[i].getA()[0][0]
p_group.append(p)
p_sum = np.sum(p_group)
# 取最大后验概率
max_p = max(p_group)
max_index = p_group.index(max_p)
# 最大后验概率聚类
for i in range(k):
if i == max_index:
classification_temp['cluster_'+str(max_index)].append(j)
p_group = [p_group[i]/p_sum for i in range(len(p_group))]
p_all.append(p_group)
# 更新 mu, sigma, alpha
mu_datasets = []
sigma_datasets = []
alpha_datasets = []
for i in range(k):
mu_temp_numerator = 0
mu_temp_denominator = 0
sigma_temp = 0
alpha_temp = 0
mu_numerator = [p_all[j][i]*xx[j] for j in range(len(x))]
for mm in mu_numerator:
mu_temp_numerator += mm
mu_denominator = [p_all[j][i] for j in range(len(x))]
for nn in mu_denominator:
mu_temp_denominator += nn
mu_dataset = mu_temp_numerator/mu_temp_denominator
mu_datasets.append(mu_dataset)
sigma = [p_all[j][i].getA()[0][0]*(xx[j]-mu_dataset).T*(xx[j]-mu_dataset) for j in range(len(x))]
for ss in sigma:
sigma_temp += ss
sigma_dataset = sigma_temp/mu_temp_denominator
sigma_datasets.append(sigma_dataset)
alpha_new = [p_all[j][i] for j in range(len(x))]
for alpha_nn in alpha_new:
alpha_temp += alpha_nn
alpha_dataset = alpha_temp/len(x)
alpha_datasets.append(alpha_dataset)
return p_all, mu_datasets, sigma_datasets, alpha_datasets, classification_temp
def cluster_visiualization(k, steps):
```
可视化聚类结果。
参数:
k:簇类数
steps:迭代次数
返回:
无
```
post_probability, mu_datasets, sigma_datasets, alpha_datasets, classification_temp = posterior_probability(k, steps)
plt.figure(figsize=(8, 8))
markers = ['.', 's', '^', '<', '>', 'P']
plt.xlim(0.1, 0.9)
plt.ylim(0, 0.9)
plt.grid()
plt.scatter(x, y, color='r')
plt.figure(figsize=(8, 8))
for i in range(k):
# 依据聚类获取对应数据,并显示
xx = [x[num] for num in classification_temp['cluster_'+str(i)]]
yy = [y[num] for num in classification_temp['cluster_'+str(i)]]
plt.xlim(0.1, 0.9)
plt.ylim(0, 0.9)
plt.grid()
plt.scatter(xx, yy, marker=markers[i])
plt.savefig("./images/gauss_cluster.png", format="png")
if __name__ == "__main__":
cluster_visiualization(3, 100)