task
Generate clustering dataset using sklearn
• https://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html
Implement your own version of
• K-Means
• GMM
• Spectral Clustering
Visualize and compare the results with the standard results
• https://scikit-learn.org/stable/modules/clustering.html#overview-of-clustering-methods
solution
Completed and run on Google Colab.
1. Results of sklearn's built-in clustering methods
print(__doc__)
import time
import warnings
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice
np.random.seed(0)
# ============
# Generate datasets. We choose the size big enough to see the scalability
# of the algorithms, but not too big to avoid too long running times
# ============
n_samples = 1500
noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
noise=.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
no_structure = np.random.rand(n_samples, 2), None
# Anisotropically distributed data
random_state = 170
X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)
# blobs with varied variances
varied = datasets.make_blobs(n_samples=n_samples,
cluster_std=[1.0, 2.5, 0.5],
random_state=random_state)
# ============
# Set up cluster parameters
# ============
plt.figure(figsize=(9 * 2 + 3, 12.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
hspace=.01)
plot_num = 1
default_base = {'quantile': .3,
'eps': .3,
'damping': .9,
'preference': -200,
'n_neighbors': 10,
'n_clusters': 3,
'min_samples': 20,
'xi': 0.05,
'min_cluster_size': 0.1}
datasets = [
(noisy_circles, {'damping': .77, 'preference': -240,
'quantile': .2, 'n_clusters': 2,
'min_samples': 20, 'xi': 0.25}),
(noisy_moons, {'damping': .75, 'preference': -220, 'n_clusters': 2}),
(varied, {'eps': .18, 'n_neighbors': 2,
'min_samples': 5, 'xi': 0.035, 'min_cluster_size': .2}),
(aniso, {'eps': .15, 'n_neighbors': 2,
'min_samples': 20, 'xi': 0.1, 'min_cluster_size': .2}),
(blobs, {}),
(no_structure, {})]
for i_dataset, (dataset, algo_params) in enumerate(datasets):
# update parameters with dataset-specific values
params = default_base.copy()
params.update(algo_params)
X, y = dataset
# normalize dataset for easier parameter selection
X = StandardScaler().fit_transform(X)
# estimate bandwidth for mean shift
bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])
# connectivity matrix for structured Ward
connectivity = kneighbors_graph(
X, n_neighbors=params['n_neighbors'], include_self=False)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)
# ============
# Create cluster objects
# ============
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
ward = cluster.AgglomerativeClustering(
n_clusters=params['n_clusters'], linkage='ward',
connectivity=connectivity)
spectral = cluster.SpectralClustering(
n_clusters=params['n_clusters'], eigen_solver='arpack',
affinity="nearest_neighbors")
dbscan = cluster.DBSCAN(eps=params['eps'])
optics = cluster.OPTICS(min_samples=params['min_samples'],
xi=params['xi'],
min_cluster_size=params['min_cluster_size'])
affinity_propagation = cluster.AffinityPropagation(
damping=params['damping'], preference=params['preference'])
average_linkage = cluster.AgglomerativeClustering(
linkage="average", affinity="cityblock",
n_clusters=params['n_clusters'], connectivity=connectivity)
birch = cluster.Birch(n_clusters=params['n_clusters'])
gmm = mixture.GaussianMixture(
n_components=params['n_clusters'], covariance_type='full')
clustering_algorithms = (
('MiniBatchKMeans', two_means),
('AffinityPropagation', affinity_propagation),
('MeanShift', ms),
('SpectralClustering', spectral),
('Ward', ward),
('AgglomerativeClustering', average_linkage),
('DBSCAN', dbscan),
('OPTICS', optics),
('Birch', birch),
('GaussianMixture', gmm)
)
for name, algorithm in clustering_algorithms:
t0 = time.time()
# catch warnings related to kneighbors_graph
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
message="the number of connected components of the " +
"connectivity matrix is [0-9]{1,2}" +
" > 1. Completing it to avoid stopping the tree early.",
category=UserWarning)
warnings.filterwarnings(
"ignore",
message="Graph is not fully connected, spectral embedding" +
" may not work as expected.",
category=UserWarning)
algorithm.fit(X)
t1 = time.time()
if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(int)  # np.int is removed in NumPy >= 1.24
else:
y_pred = algorithm.predict(X)
plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
if i_dataset == 0:
plt.title(name, size=18)
colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
'#f781bf', '#a65628', '#984ea3',
'#999999', '#e41a1c', '#dede00']),
int(max(y_pred) + 1))))
# add black color for outliers (if any)
colors = np.append(colors, ["#000000"])
plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])
plt.xlim(-2.5, 2.5)
plt.ylim(-2.5, 2.5)
plt.xticks(())
plt.yticks(())
plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
transform=plt.gca().transAxes, size=15,
horizontalalignment='right')
plot_num += 1
plt.show()
2. My own implementations of the three clustering methods
(1) K-Means
import numpy as np
import random
def K_means(X: np.ndarray, K: int, threshold: float):
    # Initialization: randomly pick K distinct data points as the initial centers
    indset = random.sample(range(X.shape[0]), K)
    centerset = np.array([X[ind] for ind in indset])
    newcenterset = np.copy(centerset)
    pred = np.random.randint(0, K, size=X.shape[0])
    stable = False
    while not stable:
        # E step: assign every point to its nearest center
        for i, point in enumerate(X):
            diff = np.linalg.norm(point - centerset, axis=1)
            pred[i] = np.argmin(diff)
        # M step: recompute each center as the mean of its assigned points
        for indvalue in range(len(centerset)):
            indexset = np.where(pred == indvalue)
            if len(indexset[0]) > 0:  # keep the old center if a cluster goes empty
                newcenterset[indvalue] = np.mean(X[indexset], 0)
        # stop once even the largest center displacement is below the threshold
        moves = np.linalg.norm(newcenterset - centerset, axis=1)
        if moves.max() <= threshold:
            stable = True
        centerset = np.copy(newcenterset)
    return pred, centerset
Given 𝑁 input data points, find 𝐾 clusters:
- Randomly select K center points
- Assign each data point to one of the K centers
- Re-compute the K centers as the mean of each group
- Iterate steps 2 & 3 until the centers stabilize
TIPs:
1. K-Means is sensitive to its initial values. The code above picks the initial centers at random, so a single run can give a poor clustering; several restarts are usually needed for a good result (see the restart sketch below).
2. The stopping criterion for the EM-style loop is the Euclidean distance between the old and new centers: the loop stops once it drops below a threshold, and the threshold has to be chosen sensibly for the scale of the dataset.
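A minimal restart wrapper along the lines of TIP 1, assuming the K_means function above (the helper name, restart count, and threshold are illustrative choices, not values from the assignment): it reruns K_means several times and keeps the run with the smallest within-cluster sum of squares.

import numpy as np

def kmeans_restarts(X: np.ndarray, K: int, threshold: float = 0.01, n_init: int = 10):
    # rerun K_means n_init times; keep the labeling with the lowest inertia
    best_pred, best_centers, best_inertia = None, None, np.inf
    for _ in range(n_init):
        pred, centers = K_means(X, K, threshold)
        # inertia: within-cluster sum of squared distances to the assigned center
        inertia = sum(np.sum((X[pred == k] - centers[k]) ** 2) for k in range(K))
        if inertia < best_inertia:
            best_pred, best_centers, best_inertia = pred, centers, inertia
    return best_pred, best_centers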
(2) GMM
import numpy as np
import random
import math
def Gauss(x: np.ndarray, miu: np.ndarray, sigma: np.ndarray):
    # density of a multivariate normal N(x | miu, sigma); x and miu are (D, 1) column vectors
    D = x.shape[0]  # was x.shape[1], which is always 1 for a column vector
    c1 = np.matmul((x - miu).T, np.linalg.inv(sigma))
    c2 = np.matmul(c1, (x - miu))
    return math.pow(2 * np.pi, -D / 2) * math.pow(abs(np.linalg.det(sigma)), -0.5) * np.exp(-0.5 * c2)
def GMM(X: np.ndarray, K: int, MAX_MLE_value=-1e10, point_seq=1e-4, mle_seq=1e-5, EMcountMAX=150):
    if X.shape[0] <= X.shape[1]:
        return None
    # Initialization
    stop = False
    EMcount = 0
    X = X.T  # work with X as (D, N): one column per data point
    miu = []
    sigma = []
    pi = []
    for _ in range(K):
        # sample a small random subset to initialize each component; it must
        # contain more points than dimensions (so the covariance has full rank)
        # but not so many that every component covers the whole dataset
        tempn = K + 3
        if tempn <= X.shape[0] or tempn >= X.shape[1]:
            tempn = X.shape[1] - 1
        tempX = X[:, random.sample(range(0, X.shape[1]), tempn)]
        miu.append(np.expand_dims(np.mean(tempX, axis=1), axis=1))
        sigma.append(np.cov(tempX))
        pi.append(np.random.rand())
    # normalize the mixing weights so they sum to 1
    psum = sum(pi)
    for i in range(len(pi)):
        pi[i] = pi[i] / psum
    miu = np.array(miu)
    sigma = np.array(sigma)
    pi = np.array(pi)
    pointnums = X.shape[1]
    probability = np.zeros((pointnums, K))
    while not stop:
        MLE_value = 0
        # E step: evaluate the responsibilities gamma(z_nk)
        for i in range(pointnums):
            x = np.expand_dims(X[:, i], axis=1)
            tempsum = 0
            for k in range(K):
                # nudge x slightly when it coincides with a mean, to dodge the singularity
                if np.linalg.norm(x - miu[k]) < point_seq:
                    tempsum += pi[k] * Gauss(x + point_seq, miu[k], sigma[k])[0][0]
                else:
                    tempsum += pi[k] * Gauss(x, miu[k], sigma[k])[0][0]
            for k in range(K):
                if np.linalg.norm(x - miu[k]) < point_seq:
                    probability[i][k] = pi[k] * Gauss(x + point_seq, miu[k], sigma[k])[0][0] / tempsum
                else:
                    probability[i][k] = pi[k] * Gauss(x, miu[k], sigma[k])[0][0] / tempsum
            MLE_value += math.log(tempsum)
        # M step: re-estimate the parameters with the current responsibilities
        for k in range(K):
            N_k = sum(probability[:, k])
            # weighted mean: each column of X is scaled by its responsibility for component k
            gamaset = np.multiply(X, probability[:, k])
            miu[k] = np.expand_dims(gamaset.sum(axis=1), axis=1) / N_k
            tempsigma = np.zeros((X.shape[0], X.shape[0]))
            for i in range(pointnums):
                x = np.expand_dims(X[:, i], axis=1)
                tempsigma += probability[i, k] * np.matmul(x - miu[k], (x - miu[k]).T)
            sigma[k] = tempsigma / N_k
            pi[k] = N_k / pointnums
        EMcount += 1
        # stop when the log-likelihood converges, stops increasing,
        # or the iteration cap is reached
        if MLE_value > MAX_MLE_value and EMcount < EMcountMAX:
            if np.abs(MLE_value - MAX_MLE_value) < mle_seq:
                stop = True
            MAX_MLE_value = MLE_value
        else:
            stop = True
    # hard assignment: each point goes to the component with the highest responsibility
    pred = np.argmax(probability, axis=1)
    return pred, probability
- Initialize the means 𝜇𝑘, covariances Σ𝑘 and weights 𝜋𝑘
- E-step. Evaluate the posterior 𝑝(𝑧𝑛𝑘 = 1|𝑥𝑛), intuitively this is the probability of 𝑥𝑛 being
assigned to each of the K clusters
\gamma(z_{nk}) = \frac{\pi_k \mathcal{N}(\mathbf{x}_n \mid \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k)}{\sum_{j=1}^{K} \pi_j \mathcal{N}(\mathbf{x}_n \mid \boldsymbol{\mu}_j, \boldsymbol{\Sigma}_j)}
- M-Step. Estimate the parameters using MLE.
- Evaluate the log likelihood, if converges, stop. Otherwise go back to E-step
N_k = \sum_{n=1}^{N} \gamma(z_{nk})
\boldsymbol{\mu}_k^{\text{new}} = \frac{1}{N_k} \sum_{n=1}^{N} \gamma(z_{nk})\, \mathbf{x}_n
\boldsymbol{\Sigma}_k^{\text{new}} = \frac{1}{N_k} \sum_{n=1}^{N} \gamma(z_{nk}) (\mathbf{x}_n - \boldsymbol{\mu}_k^{\text{new}})(\mathbf{x}_n - \boldsymbol{\mu}_k^{\text{new}})^{\mathrm{T}}
\pi_k^{\text{new}} = \frac{N_k}{N}
\ln p(\mathbf{X} \mid \boldsymbol{\mu}, \boldsymbol{\Sigma}, \boldsymbol{\pi}) = \sum_{n=1}^{N} \ln \left\{ \sum_{k=1}^{K} \pi_k \mathcal{N}(\mathbf{x}_n \mid \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k) \right\}
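The loops in the GMM implementation above follow these formulas one point at a time. For reference, a vectorized sketch of a single EM iteration, assuming scipy is available; note that it keeps X as (N, D), the opposite of the transposed (D, N) layout used inside GMM above, and the helper name em_step is mine:

import numpy as np
from scipy.stats import multivariate_normal

def em_step(X, pi, miu, sigma):
    # X: (N, D); pi: (K,); miu: (K, D); sigma: (K, D, D)
    K = len(pi)
    # E step: weighted densities pi_k * N(x_n | miu_k, sigma_k) for all n, k at once
    dens = np.stack([pi[k] * multivariate_normal.pdf(X, miu[k], sigma[k])
                     for k in range(K)], axis=1)    # (N, K)
    gamma = dens / dens.sum(axis=1, keepdims=True)  # responsibilities gamma(z_nk)
    # M step: the closed-form updates above
    N_k = gamma.sum(axis=0)
    miu = (gamma.T @ X) / N_k[:, None]
    for k in range(K):
        d = X - miu[k]
        sigma[k] = (gamma[:, k, None] * d).T @ d / N_k[k]
    pi = N_k / X.shape[0]
    log_lik = np.log(dens.sum(axis=1)).sum()        # ln p(X | mu, Sigma, pi)
    return pi, miu, sigma, log_lik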
TIPs:
1. For the initial values, the method above samples random points and computes their mean, covariance matrix, and so on. The number of sampled points cannot be too small: it must exceed the dimension of the data (i.e. m > n), or the covariance matrix is singular. It also cannot be too large: GMM is likewise sensitive to its initial values, and if too many points are sampled the initial components essentially cover the whole dataset, making the classes hard to separate.
2. The criterion for stopping the EM loop is the log-likelihood \ln p(\mathbf{X} \mid \boldsymbol{\mu}, \boldsymbol{\Sigma}, \boldsymbol{\pi}) given above: it must keep increasing, otherwise the loop terminates; a maximum iteration count also prevents it from looping forever.
3. The code above still does not handle the singularity problem well and needs improvement; a common remedy is sketched below.
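A standard remedy for TIP 3, sketched under the assumption that a tiny ridge is acceptable: add a small constant to the diagonal of each covariance so it stays invertible even when a component collapses onto a single point. sklearn's GaussianMixture does the same thing through its reg_covar parameter; the eps value below is an illustrative choice.

import numpy as np

def regularize_cov(sigma: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    # keep the covariance positive definite so np.linalg.inv and the
    # determinant inside Gauss stay well-behaved
    return sigma + eps * np.eye(sigma.shape[0])

Applying this to each sigma[k] at the end of the M step could replace the x + point_seq nudge used above.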
(3) Spectral Clustering
import numpy as np
import random
from scipy import spatial
from sklearn.cluster import KMeans
def Spectral_Clustering(X: np.ndarray, K: int, add_knum=5):
    pointnums = X.shape[0]
    W = np.zeros((pointnums, pointnums))
    sigma = 1
    # number of KNN neighbors: too few hurts the clustering, too many costs more
    knum = K + add_knum
    if knum + 1 >= pointnums:
        knum = pointnums - 1
    tree = spatial.KDTree(X)
    for i in range(pointnums):
        point = X[i]
        scipy_nn_dis, scipy_nn_idx = tree.query(point, knum + 1)
        scipy_nn_idx = scipy_nn_idx[1:]  # drop the query point itself
        scipy_nn_dis = scipy_nn_dis[1:]
        for k in range(len(scipy_nn_idx)):
            j = scipy_nn_idx[k]
            # Gaussian similarity, stored symmetrically
            W[j][i] = W[i][j] = np.exp(-(scipy_nn_dis[k] ** 2) / (2 * sigma ** 2))
    dset = [sum(W[i, :]) for i in range(W.shape[0])]
    D = np.diag(dset)
    L = D - W
    # normalize: L_sym = D^(-1/2) L D^(-1/2)
    sqrtDegreeMatrix = np.diag(1.0 / (np.array(dset) ** 0.5))
    L = np.dot(np.dot(sqrtDegreeMatrix, L), sqrtDegreeMatrix)
    # L is symmetric, so eigh is appropriate and guarantees real eigenpairs
    e_vals, e_vecs = np.linalg.eigh(L)
    V = e_vecs[:, np.argsort(e_vals)[:K]]  # the K smallest eigenvectors as columns
    p, _ = K_means(V, K, 0.01)
    # p = KMeans(n_clusters=K).fit_predict(V)  # sklearn alternative
    return p, V
- Build the graph to get adjacency matrix 𝑊 ∈ ℝ𝑛×𝑛
- Compute a normalized Laplacian 𝐿′, e.g. 𝐿𝑟𝑤 (the code above uses the symmetric form 𝐿𝑠𝑦𝑚 = D^(-1/2) L D^(-1/2); see the sketch after this list for the 𝐿𝑟𝑤 variant)
- Compute the first (smallest) 𝑘 eigenvectors 𝑣1, ⋯ , 𝑣𝑘 of 𝐿′
- Let 𝑉 ∈ ℝ𝑛×𝑘 be the matrix containing the vectors 𝑣1,⋯ , 𝑣𝑘 as columns
- For 𝑖 = 1,⋯ 𝑛 , let 𝑦𝑖 ∈ ℝ𝑘 be the vector corresponding to the 𝑖-th row of 𝑉
- Cluster the points {𝑦𝑖 ∈ ℝ𝑘} with k-means algorithm into clusters 𝐶1,⋯ , 𝐶𝑘
- The final output clusters are 𝐴1, ⋯ ,𝐴𝑘 where 𝐴𝑖 = {𝑗|𝑦𝑗 ∈ 𝐶𝑖}
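As noted in step 2, the code above uses the symmetric normalization 𝐿𝑠𝑦𝑚. The random-walk variant 𝐿𝑟𝑤 can be handled without forming 𝐷⁻¹ explicitly, since its eigenvectors solve the generalized problem 𝐿𝑣 = 𝜆𝐷𝑣. A minimal sketch, assuming scipy and a graph where every node has positive degree (the function name lrw_embedding is mine):

import numpy as np
from scipy.linalg import eigh

def lrw_embedding(W: np.ndarray, K: int) -> np.ndarray:
    # unnormalized Laplacian L = D - W from the affinity matrix W
    D = np.diag(W.sum(axis=1))
    L = D - W
    # eigenvectors of L_rw = D^(-1) L solve L v = lambda D v;
    # eigh handles the symmetric-definite pair (L, D) directly
    e_vals, e_vecs = eigh(L, D)
    return e_vecs[:, :K]  # eigenvalues come back in ascending order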
TIPs
The affinity matrix W in the code above is built with a KNN graph. Note that the K nearest neighbors chosen here are not the same K as the number of clusters; in practice, a neighbor count slightly larger than the number of clusters is enough. If the neighbor count is too small, the clustering quality suffers, as the sketch below illustrates.
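A small usage sketch for this TIP, assuming the Spectral_Clustering and K_means functions above (the neighbor counts tried here are illustrative): it clusters two moons with three different add_knum values to make the neighbor-count effect visible.

import matplotlib.pyplot as plt
from sklearn import datasets

X, _ = datasets.make_moons(n_samples=300, noise=.05)
plt.figure(figsize=(9, 3))
for i, add_knum in enumerate([1, 5, 15]):
    pred, _ = Spectral_Clustering(X, 2, add_knum=add_knum)
    plt.subplot(1, 3, i + 1)
    plt.scatter(X[:, 0], X[:, 1], c=pred, s=10)
    plt.title('add_knum=%d' % add_knum)
plt.show()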
(4) Visualizing the clustering results
class my_Kmeans():
    def __init__(self, K, threshold):
        self.K = K
        self.threshold = threshold
    def fit(self, X: np.ndarray):
        pred, centers = K_means(X, self.K, self.threshold)
        return pred
class my_GMM():
    def __init__(self, K):
        self.K = K
    def fit(self, X: np.ndarray):
        pred, probability = GMM(X, self.K)
        return pred
class my_Spectral_Clustering():
    def __init__(self, K):
        self.K = K
    def fit(self, X: np.ndarray):
        # Spectral_Clustering returns (labels, embedding); keep only the labels
        pred, _ = Spectral_Clustering(X, self.K)
        return pred
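These thin wrappers give the three implementations one uniform fit interface so the comparison loop below can treat them interchangeably. To reuse the part 1 script unchanged instead, a wrapper could mimic sklearn's convention, where fit returns self and the labels live in a labels_ attribute (which is what the hasattr(algorithm, 'labels_') check in part 1 relies on); a sketch for K-Means, with my_Kmeans_sk being a hypothetical name:

class my_Kmeans_sk():
    # sklearn-style variant: fit returns self and exposes labels_
    def __init__(self, K, threshold):
        self.K = K
        self.threshold = threshold
    def fit(self, X):
        self.labels_, self.cluster_centers_ = K_means(X, self.K, self.threshold)
        return self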
print(__doc__)
import time
import warnings
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice
np.random.seed(0)
# ============
# Generate datasets. We choose the size big enough to see the scalability
# of the algorithms, but not too big to avoid too long running times
# ============
n_samples = 1500
noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
noise=.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
no_structure = np.random.rand(n_samples, 2), None
# Anisotropically distributed data
random_state = 170
X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)
# blobs with varied variances
varied = datasets.make_blobs(n_samples=n_samples,
cluster_std=[1.0, 2.5, 0.5],
random_state=random_state)
# ============
# Set up cluster parameters
# ============
plt.figure(figsize=(3 * 2 + 3, 12.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
hspace=.01)
plot_num = 1
default_base = {'quantile': .3,
'eps': .3,
'damping': .9,
'preference': -200,
'n_neighbors': 10,
'n_clusters': 3,
'min_samples': 20,
'xi': 0.05,
'min_cluster_size': 0.1}
datasets = [
(noisy_circles, {'damping': .77, 'preference': -240,
'quantile': .2, 'n_clusters': 2,
'min_samples': 20, 'xi': 0.25}),
(noisy_moons, {'damping': .75, 'preference': -220, 'n_clusters': 2}),
(varied, {'eps': .18, 'n_neighbors': 2,
'min_samples': 5, 'xi': 0.035, 'min_cluster_size': .2}),
(aniso, {'eps': .15, 'n_neighbors': 2,
'min_samples': 20, 'xi': 0.1, 'min_cluster_size': .2}),
(blobs, {}),
(no_structure, {})]
for i_dataset, (dataset, algo_params) in enumerate(datasets):
# update parameters with dataset-specific values
params = default_base.copy()
params.update(algo_params)
X, y = dataset
# normalize dataset for easier parameter selection
X = StandardScaler().fit_transform(X)
X=np.array(X)
two_means=my_Kmeans(int(params['n_clusters']),0.01)
gmm=my_GMM(int(params['n_clusters']))
spectral = my_Spectral_Clustering(int(params['n_clusters']))
clustering_algorithms = (
('KMeans', two_means),
('SpectralClustering', spectral),
('GaussianMixture', gmm)
)
for name, algorithm in clustering_algorithms:
t0 = time.time()
# catch warnings related to kneighbors_graph
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
message="the number of connected components of the " +
"connectivity matrix is [0-9]{1,2}" +
" > 1. Completing it to avoid stopping the tree early.",
category=UserWarning)
warnings.filterwarnings(
"ignore",
message="Graph is not fully connected, spectral embedding" +
" may not work as expected.",
category=UserWarning)
y_pred=algorithm.fit(X)
t1 = time.time()
plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
if i_dataset == 0:
plt.title(name, size=18)
colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
'#f781bf', '#a65628', '#984ea3',
'#999999', '#e41a1c', '#dede00']),
int(max(y_pred) + 1))))
# add black color for outliers (if any)
colors = np.append(colors, ["#000000"])
plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])
plt.xlim(-2.5, 2.5)
plt.ylim(-2.5, 2.5)
plt.xticks(())
plt.yticks(())
plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
transform=plt.gca().transAxes, size=15,
horizontalalignment='right')
plot_num += 1
plt.show()
Because the algorithms are sensitive to their initial values, the clustering results can differ between runs; several attempts may be needed to obtain a good clustering. A quantitative comparison against the sklearn baselines is sketched below.
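Beyond eyeballing the plots, the agreement with the sklearn baselines can be quantified with the adjusted Rand index, which is 1.0 for identical partitions (up to label permutation) and near 0 for random ones. A sketch, assuming the K_means function above (the dataset and K are illustrative):

from sklearn import cluster, datasets
from sklearn.metrics import adjusted_rand_score
from sklearn.preprocessing import StandardScaler

X, y = datasets.make_blobs(n_samples=500, random_state=8)
X = StandardScaler().fit_transform(X)
mine, _ = K_means(X, 3, 0.01)
ref = cluster.KMeans(n_clusters=3).fit_predict(X)
print('ARI vs sklearn KMeans:', adjusted_rand_score(ref, mine))
print('ARI vs ground truth :', adjusted_rand_score(y, mine))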