task
Generate clustering dataset using sklearn
• https://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html
Implement your own version of
• K-Means
• GMM
• Spectral Clustering
Visualize and compare the results with the standard results
• https://scikit-learn.org/stable/modules/clustering.html#overview-of-clustering-methods
solution
Completed and run on Google Colab.
1. Results of sklearn's built-in clustering methods
print(__doc__)
import time
import warnings
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice
np.random.seed(0)
# ============
# Generate datasets. We choose the size big enough to see the scalability
# of the algorithms, but not too big to avoid too long running times
# ============
n_samples = 1500
noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
noise=.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
no_structure = np.random.rand(n_samples, 2), None
# Anisotropically distributed data
random_state = 170
X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)
# blobs with varied variances
varied = datasets.make_blobs(n_samples=n_samples,
cluster_std=[1.0, 2.5, 0.5],
random_state=random_state)
# ============
# Set up cluster parameters
# ============
plt.figure(figsize=(9 * 2 + 3, 12.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
hspace=.01)
plot_num = 1
default_base = {'quantile': .3,
'eps': .3,
'damping': .9,
'preference': -200,
'n_neighbors': 10,
'n_clusters': 3,
'min_samples': 20,
'xi': 0.05,
'min_cluster_size': 0.1}
datasets = [
(noisy_circles, {'damping': .77, 'preference': -240,
'quantile': .2, 'n_clusters': 2,
'min_samples': 20, 'xi': 0.25}),
(noisy_moons, {'damping': .75, 'preference': -220, 'n_clusters': 2}),
(varied, {'eps': .18, 'n_neighbors': 2,
'min_samples': 5, 'xi': 0.035, 'min_cluster_size': .2}),
(aniso, {'eps': .15, 'n_neighbors': 2,
'min_samples': 20, 'xi': 0.1, 'min_cluster_size': .2}),
(blobs, {}),
(no_structure, {})]
for i_dataset, (dataset, algo_params) in enumerate(datasets):
# update parameters with dataset-specific values
params = default_base.copy()
params.update(algo_params)
X, y = dataset
# normalize dataset for easier parameter selection
X = StandardScaler().fit_transform(X)
# estimate bandwidth for mean shift
bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])
# connectivity matrix for structured Ward
connectivity = kneighbors_graph(
X, n_neighbors=params['n_neighbors'], include_self=False)
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)
# ============
# Create cluster objects
# ============
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
ward = cluster.AgglomerativeClustering(
n_clusters=params['n_clusters'], linkage='ward',
connectivity=connectivity)
spectral = cluster.SpectralClustering(
n_clusters=params['n_clusters'], eigen_solver='arpack',
affinity="nearest_neighbors")
dbscan = cluster.DBSCAN(eps=params['eps'])
optics = cluster.OPTICS(min_samples=params['min_samples'],
xi=params['xi'],
min_cluster_size=params['min_cluster_size'])
affinity_propagation = cluster.AffinityPropagation(
damping=params['damping'], preference=params['preference'])
average_linkage = cluster.AgglomerativeClustering(
linkage="average", affinity="cityblock",
n_clusters=params['n_clusters'], connectivity=connectivity)
birch = cluster.Birch(n_clusters=params['n_clusters'])
gmm = mixture.GaussianMixture(
n_components=params['n_clusters'], covariance_type='full')
clustering_algorithms = (
('MiniBatchKMeans', two_means),
('AffinityPropagation', affinity_propagation),
('MeanShift', ms),
('SpectralClustering', spectral),
('Ward', ward),
('AgglomerativeClustering', average_linkage),
('DBSCAN', dbscan),
('OPTICS', optics),
('Birch', birch),
('GaussianMixture', gmm)
)
for name, algorithm in clustering_algorithms:
t0 = time.time()
# catch warnings related to kneighbors_graph
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
message="the number of connected components of the " +
"connectivity matrix is [0-9]{1,2}" +
" > 1. Completing it to avoid stopping the tree early.",
category=UserWarning)
warnings.filterwarnings(
"ignore",
message="Graph is not fully connected, spectral embedding" +
" may not work as expected.",
category=UserWarning)
algorithm.fit(X)
t1 = time.time()
if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(int)  # np.int is removed in NumPy >= 1.24
else:
y_pred = algorithm.predict(X)
plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
if i_dataset == 0:
plt.title(name, size=18)
colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
'#f781bf', '#a65628', '#984ea3',
'#999999', '#e41a1c', '#dede00']),
int(max(y_pred) + 1))))
# add black color for outliers (if any)
colors = np.append(colors, ["#000000"])
plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])
plt.xlim(-2.5, 2.5)
plt.ylim(-2.5, 2.5)
plt.xticks(())
plt.yticks(())
plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
transform=plt.gca().transAxes, size=15,
horizontalalignment='right')
plot_num += 1
plt.show()
2. My own implementations of the three clustering methods
(1) K-Means
import numpy as np
import random
def K_means(X: np.ndarray, K: int, threshold: float):
    # Initialization: randomly pick K distinct data points as the initial centers
    indset = random.sample(range(X.shape[0]), K)
    centerset = np.array([X[ind] for ind in indset])
    newcenterset = np.copy(centerset)
    pred = np.random.randint(0, K, size=X.shape[0])
    stable = False
    while not stable:
        # E step: assign every point to its nearest center
        for i, point in enumerate(X):
            diff = np.linalg.norm(point - centerset, axis=1)
            pred[i] = np.argmin(diff)
        # M step: recompute each center as the mean of its assigned points
        for indvalue in range(len(centerset)):
            indexset = np.where(pred == indvalue)
            if len(indexset[0]) > 0:  # keep the old center if a cluster goes empty
                newcenterset[indvalue] = np.mean(X[indexset], 0)
        # stop once even the largest center displacement is below the threshold
        moves = np.linalg.norm(newcenterset - centerset, axis=1)
        if moves.max() <= threshold:
            stable = True
        centerset = np.copy(newcenterset)
    return pred, centerset
Given 𝑁 input data points, find 𝐾 clusters:
- Randomly select K center points
- Assign each data point to one of the K centers
- Re-compute the K centers as the mean of each group
- Iterate steps 2 & 3 until the centers stabilize
TIPs:
1. K-Means is sensitive to its initial values. The code above picks the initial centers at random, so a single run can give a poor clustering; several restarts are usually needed for a good result (see the restart sketch below).
2. The stopping criterion for the EM-style loop is the Euclidean distance between the old and new centers: the loop stops once it drops below a threshold, and the threshold has to be chosen sensibly for the scale of the dataset.
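A minimal restart wrapper along the lines of TIP 1, assuming the K_means function above (the helper name, restart count, and threshold are illustrative choices, not values from the assignment): it reruns K_means several times and keeps the run with the smallest within-cluster sum of squares.

import numpy as np

def kmeans_restarts(X: np.ndarray, K: int, threshold: float = 0.01, n_init: int = 10):
    # rerun K_means n_init times; keep the labeling with the lowest inertia
    best_pred, best_centers, best_inertia = None, None, np.inf
    for _ in range(n_init):
        pred, centers = K_means(X, K, threshold)
        # inertia: within-cluster sum of squared distances to the assigned center
        inertia = sum(np.sum((X[pred == k] - centers[k]) ** 2) for k in range(K))
        if inertia < best_inertia:
            best_pred, best_centers, best_inertia = pred, centers, inertia
    return best_pred, best_centers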
(2) GMM
import numpy as np
import random
import math
def Gauss(x: np.ndarray, miu: np.ndarray, sigma: np.ndarray):
    # density of a multivariate normal N(x | miu, sigma); x and miu are (D, 1) column vectors
    D = x.shape[0]  # was x.shape[1], which is always 1 for a column vector
    c1 = np.matmul((x - miu).T, np.linalg.inv(sigma))
    c2 = np.matmul(c1, (x - miu))
    return math.pow(2 * np.pi, -D / 2) * math.pow(abs(np.linalg.det(sigma)), -0.5) * np.exp(-0.5 * c2)
def GMM(X: np.ndarray, K: int, MAX_MLE_value=-1e10, point_seq=1e-4, mle_seq=1e-5, EMcountMAX=150):
    if X.shape[0] <= X.shape[1]:
        return None
    # Initialization
    stop = False
    EMcount = 0
    X = X.T  # work with X as (D, N): one column per data point
    miu = []
    sigma = []
    pi = []
    for _ in range(K):
        # sample a small random subset to initialize each component; it must
        # contain more points than dimensions (so the covariance has full rank)
        # but not so many that every component covers the whole dataset
        tempn = K + 3
        if tempn <= X.shape[0] or tempn >= X.shape[1]:
            tempn = X.shape[1] - 1
        tempX = X[:, random.sample(range(0, X.shape[1]), tempn)]
        miu.append(np.expand_dims(np.mean(tempX, axis=1), axis=1))
        sigma.append(np.cov(tempX))
        pi.append(np.random.rand())
    # normalize the mixing weights so they sum to 1
    psum = sum(pi)
    for i in range(len(pi)):
        pi[i] = pi[i] / psum
    miu = np.array(miu)
    sigma = np.array(sigma)
    pi = np.array(pi)
    pointnums = X.shape[1]
    probability = np.zeros((pointnums, K))
    while not stop:
        MLE_value = 0
        # E step: evaluate the responsibilities gamma(z_nk)
        for i in range(pointnums):
            x = np.expand_dims(X[:, i], axis=1)
            tempsum = 0
            for k in range(K):
                # nudge x slightly when it coincides with a mean, to dodge the singularity
                if np.linalg.norm(x - miu[k]) < point_seq:
                    tempsum += pi[k] * Gauss(x + point_seq, miu[k], sigma[k])[0][0]
                else:
                    tempsum += pi[k] * Gauss(x, miu[k], sigma[k])[0][0]
            for k in range(K):
                if np.linalg.norm(x - miu[k]) < point_seq:
                    probability[i][k] = pi[k] * Gauss(x + point_seq, miu[k], sigma[k])[0][0] / tempsum
                else:
                    probability[i][k] = pi[k] * Gauss(x, miu[k], sigma[k])[0][0] / tempsum
            MLE_value += math.log(tempsum)
        # M step: re-estimate the parameters with the current responsibilities
        for k in range(K):
            N_k = sum(probability[:, k])
            # weighted mean: each column of X is scaled by its responsibility for component k
            gamaset = np.multiply(X, probability[:, k])
            miu[k] = np.expand_dims(gamaset.sum(axis=1), axis=1) / N_k
            tempsigma = np.zeros((X.shape[0], X.shape[0]))
            for i in range(pointnums):
                x = np.expand_dims(X[:, i], axis=1)
                tempsigma += probability[i, k] * np.matmul(x - miu[k], (x - miu[k]).T)
            sigma[k] = tempsigma / N_k
            pi[k] = N_k / pointnums
        EMcount += 1
        # stop when the log-likelihood converges, stops increasing,
        # or the iteration cap is reached
        if MLE_value > MAX_MLE_value and EMcount < EMcountMAX:
            if np.abs(MLE_value - MAX_MLE_value) < mle_seq:
                stop = True
            MAX_MLE_value = MLE_value
        else:
            stop = True
    # hard assignment: each point goes to the component with the highest responsibility
    pred = np.argmax(probability, axis=1)
    return pred, probability
- Initialize the means 𝜇𝑘, covariances Σ𝑘 and weights 𝜋𝑘
- E-step. Evaluate the posterior 𝑝(𝑧𝑛𝑘 = 1|𝑥𝑛), intuitively this is the probability of 𝑥𝑛 being
assigned to each of the K clusters
\gamma(z_{nk}) = \frac{\pi_k \mathcal{N}(\mathbf{x}_n \mid \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k)}{\sum_{j=1}^{K} \pi_j \mathcal{N}(\mathbf{x}_n \mid \boldsymbol{\mu}_j, \boldsymbol{\Sigma}_j)}
- M-Step. Estimate the parameters using MLE.
- Evaluate the log likelihood, if converges, stop. Otherwise go back to E-step
N_k = \sum_{n=1}^{N} \gamma(z_{nk})
\boldsymbol{\mu}_k^{\text{new}} = \frac{1}{N_k} \sum_{n=1}^{N} \gamma(z_{nk})\, \mathbf{x}_n
\boldsymbol{\Sigma}_k^{\text{new}} = \frac{1}{N_k} \sum_{n=1}^{N} \gamma(z_{nk}) (\mathbf{x}_n - \boldsymbol{\mu}_k^{\text{new}})(\mathbf{x}_n - \boldsymbol{\mu}_k^{\text{new}})^{\mathrm{T}}
\pi_k^{\text{new}} = \frac{N_k}{N}
\ln p(\mathbf{X} \mid \boldsymbol{\mu}, \boldsymbol{\Sigma}, \boldsymbol{\pi}) = \sum_{n=1}^{N} \ln \left\{ \sum_{k=1}^{K} \pi_k \mathcal{N}(\mathbf{x}_n \mid \boldsymbol{\mu}_k, \boldsymbol{\Sigma}_k) \right\}
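The loops in the GMM implementation above follow these formulas one point at a time. For reference, a vectorized sketch of a single EM iteration, assuming scipy is available; note that it keeps X as (N, D), the opposite of the transposed (D, N) layout used inside GMM above, and the helper name em_step is mine:

import numpy as np
from scipy.stats import multivariate_normal

def em_step(X, pi, miu, sigma):
    # X: (N, D); pi: (K,); miu: (K, D); sigma: (K, D, D)
    K = len(pi)
    # E step: weighted densities pi_k * N(x_n | miu_k, sigma_k) for all n, k at once
    dens = np.stack([pi[k] * multivariate_normal.pdf(X, miu[k], sigma[k])
                     for k in range(K)], axis=1)    # (N, K)
    gamma = dens / dens.sum(axis=1, keepdims=True)  # responsibilities gamma(z_nk)
    # M step: the closed-form updates above
    N_k = gamma.sum(axis=0)
    miu = (gamma.T @ X) / N_k[:, None]
    for k in range(K):
        d = X - miu[k]
        sigma[k] = (gamma[:, k, None] * d).T @ d / N_k[k]
    pi = N_k / X.shape[0]
    log_lik = np.log(dens.sum(axis=1)).sum()        # ln p(X | mu, Sigma, pi)
    return pi, miu, sigma, log_lik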
TIPs:
1. For the initial values, the method above samples random points and computes their mean, covariance matrix, and so on. The number of sampled points cannot be too small: it must exceed the dimension of the data (i.e. m > n), or the covariance matrix is singular. It also cannot be too large: GMM is likewise sensitive to its initial values, and if too many points are sampled the initial components essentially cover the whole dataset, making the classes hard to separate.
2. The criterion for stopping the EM loop is the log-likelihood \ln p(\mathbf{X} \mid \boldsymbol{\mu}, \boldsymbol{\Sigma}, \boldsymbol{\pi}) given above: it must keep increasing, otherwise the loop terminates; a maximum iteration count also prevents it from looping forever.
3. The code above still does not handle the singularity problem well and needs improvement; a common remedy is sketched below.
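A standard remedy for TIP 3, sketched under the assumption that a tiny ridge is acceptable: add a small constant to the diagonal of each covariance so it stays invertible even when a component collapses onto a single point. sklearn's GaussianMixture does the same thing through its reg_covar parameter; the eps value below is an illustrative choice.

import numpy as np

def regularize_cov(sigma: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    # keep the covariance positive definite so np.linalg.inv and the
    # determinant inside Gauss stay well-behaved
    return sigma + eps * np.eye(sigma.shape[0])

Applying this to each sigma[k] at the end of the M step could replace the x + point_seq nudge used above.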
(3) Spectral Clustering
import numpy as np
import random
from scipy import spatial
from sklearn.cluster import KMeans
def Spectral_Clustering(X: np.ndarray, K: int, add_knum=5):
    pointnums = X.shape[0]
    W = np.zeros((pointnums, pointnums))
    sigma = 1
    # number of KNN neighbors: too few hurts the clustering, too many costs more
    knum = K + add_knum
    if knum + 1 >= pointnums:
        knum = pointnums - 1
    tree = spatial.KDTree(X)
    for i in range(pointnums):
        point = X[i]
        scipy_nn_dis, scipy_nn_idx = tree.query(point, knum + 1)
        scipy_nn_idx = scipy_nn_idx[1:]  # drop the query point itself
        scipy_nn_dis = scipy_nn_dis[1:]
        for k in range(len(scipy_nn_idx)):
            j = scipy_nn_idx[k]
            # Gaussian similarity, stored symmetrically
            W[j][i] = W[i][j] = np.exp(-(scipy_nn_dis[k] ** 2) / (2 * sigma ** 2))
    dset = [sum(W[i, :]) for i in range(W.shape[0])]
    D = np.diag(dset)
    L = D - W
    # normalize: L_sym = D^(-1/2) L D^(-1/2)
    sqrtDegreeMatrix = np.diag(1.0 / (np.array(dset) ** 0.5))
    L = np.dot(np.dot(sqrtDegreeMatrix, L), sqrtDegreeMatrix)
    # L is symmetric, so eigh is appropriate and guarantees real eigenpairs
    e_vals, e_vecs = np.linalg.eigh(L)
    V = e_vecs[:, np.argsort(e_vals)[:K]]  # the K smallest eigenvectors as columns
    p, _ = K_means(V, K, 0.01)
    # p = KMeans(n_clusters=K).fit_predict(V)  # sklearn alternative
    return p, V
- Build the graph to get adjacency matrix 𝑊 ∈ ℝ𝑛×𝑛
- Compute a normalized Laplacian 𝐿′, e.g. 𝐿𝑟𝑤 (the code above uses the symmetric form 𝐿𝑠𝑦𝑚 = D^(-1/2) L D^(-1/2); see the sketch after this list for the 𝐿𝑟𝑤 variant)
- Compute the first (smallest) 𝑘 eigenvectors 𝑣1, ⋯ , 𝑣𝑘 of 𝐿′
- Let 𝑉 ∈ ℝ𝑛×𝑘 be the matrix containing the vectors 𝑣1,⋯ , 𝑣𝑘 as columns
- For 𝑖 = 1,⋯ 𝑛 , let 𝑦𝑖 ∈ ℝ𝑘 be the vector corresponding to the 𝑖-th row of 𝑉
- Cluster the points {𝑦𝑖 ∈ ℝ𝑘} with k-means algorithm into clusters 𝐶1,⋯ , 𝐶𝑘
- The final output clusters are 𝐴1, ⋯ ,𝐴𝑘 where 𝐴𝑖 = {𝑗|𝑦𝑗 ∈ 𝐶𝑖}
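As noted in step 2, the code above uses the symmetric normalization 𝐿𝑠𝑦𝑚. The random-walk variant 𝐿𝑟𝑤 can be handled without forming 𝐷⁻¹ explicitly, since its eigenvectors solve the generalized problem 𝐿𝑣 = 𝜆𝐷𝑣. A minimal sketch, assuming scipy and a graph where every node has positive degree (the function name lrw_embedding is mine):

import numpy as np
from scipy.linalg import eigh

def lrw_embedding(W: np.ndarray, K: int) -> np.ndarray:
    # unnormalized Laplacian L = D - W from the affinity matrix W
    D = np.diag(W.sum(axis=1))
    L = D - W
    # eigenvectors of L_rw = D^(-1) L solve L v = lambda D v;
    # eigh handles the symmetric-definite pair (L, D) directly
    e_vals, e_vecs = eigh(L, D)
    return e_vecs[:, :K]  # eigenvalues come back in ascending order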
TIPs
The affinity matrix W in the code above is built with a KNN graph. Note that the K nearest neighbors chosen here are not the same K as the number of clusters; in practice, a neighbor count slightly larger than the number of clusters is enough. If the neighbor count is too small, the clustering quality suffers, as the sketch below illustrates.
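A small usage sketch for this TIP, assuming the Spectral_Clustering and K_means functions above (the neighbor counts tried here are illustrative): it clusters two moons with three different add_knum values to make the neighbor-count effect visible.

import matplotlib.pyplot as plt
from sklearn import datasets

X, _ = datasets.make_moons(n_samples=300, noise=.05)
plt.figure(figsize=(9, 3))
for i, add_knum in enumerate([1, 5, 15]):
    pred, _ = Spectral_Clustering(X, 2, add_knum=add_knum)
    plt.subplot(1, 3, i + 1)
    plt.scatter(X[:, 0], X[:, 1], c=pred, s=10)
    plt.title('add_knum=%d' % add_knum)
plt.show()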
(4) Visualizing the clustering results
class my_Kmeans():
    def __init__(self, K, threshold):
        self.K = K
        self.threshold = threshold
    def fit(self, X: np.ndarray):
        pred, centers = K_means(X, self.K, self.threshold)
        return pred
class my_GMM():
    def __init__(self, K):
        self.K = K
    def fit(self, X: np.ndarray):
        pred, probability = GMM(X, self.K)
        return pred
class my_Spectral_Clustering():
    def __init__(self, K):
        self.K = K
    def fit(self, X: np.ndarray):
        # Spectral_Clustering returns (labels, embedding); keep only the labels
        pred, _ = Spectral_Clustering(X, self.K)
        return pred
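These thin wrappers give the three implementations one uniform fit interface so the comparison loop below can treat them interchangeably. To reuse the part 1 script unchanged instead, a wrapper could mimic sklearn's convention, where fit returns self and the labels live in a labels_ attribute (which is what the hasattr(algorithm, 'labels_') check in part 1 relies on); a sketch for K-Means, with my_Kmeans_sk being a hypothetical name:

class my_Kmeans_sk():
    # sklearn-style variant: fit returns self and exposes labels_
    def __init__(self, K, threshold):
        self.K = K
        self.threshold = threshold
    def fit(self, X):
        self.labels_, self.cluster_centers_ = K_means(X, self.K, self.threshold)
        return self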
print(__doc__)
import time
import warnings
import numpy as np
import matplotlib.pyplot as plt
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice
np.random.seed(0)
# ============
# Generate datasets. We choose the size big enough to see the scalability
# of the algorithms, but not too big to avoid too long running times
# ============
n_samples = 1500
noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
noise=.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
no_structure = np.random.rand(n_samples, 2), None
# Anisotropically distributed data
random_state = 170
X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)
# blobs with varied variances
varied = datasets.make_blobs(n_samples=n_samples,
cluster_std=[1.0, 2.5, 0.5],
random_state=random_state)
# ============
# Set up cluster parameters
# ============
plt.figure(figsize=(3 * 2 + 3, 12.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
hspace=.01)
plot_num = 1
default_base = {'quantile': .3,
'eps': .3,
'damping': .9,
'preference': -200,
'n_neighbors': 10,
'n_clusters': 3,
'min_samples': 20,
'xi': 0.05,
'min_cluster_size': 0.1}
datasets = [
(noisy_circles, {'damping': .77, 'preference': -240,
'quantile': .2, 'n_clusters': 2,
'min_samples': 20, 'xi': 0.25}),
(noisy_moons, {'damping': .75, 'preference': -220, 'n_clusters': 2}),
(varied, {'eps': .18, 'n_neighbors': 2,
'min_samples': 5, 'xi': 0.035, 'min_cluster_size': .2}),
(aniso, {'eps': .15, 'n_neighbors': 2,
'min_samples': 20, 'xi': 0.1, 'min_cluster_size': .2}),
(blobs, {}),
(no_structure, {})]
for i_dataset, (dataset, algo_params) in enumerate(datasets):
# update parameters with dataset-specific values
params = default_base.copy()
params.update(algo_params)
X, y = dataset
# normalize dataset for easier parameter selection
X = StandardScaler().fit_transform(X)
X=np.array(X)
two_means=my_Kmeans(int(params['n_clusters']),0.01)
gmm=my_GMM(int(params['n_clusters']))
spectral = my_Spectral_Clustering(int(params['n_clusters']))
clustering_algorithms = (
('KMeans', two_means),
('SpectralClustering', spectral),
('GaussianMixture', gmm)
)
for name, algorithm in clustering_algorithms:
t0 = time.time()
# catch warnings related to kneighbors_graph
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
message="the number of connected components of the " +
"connectivity matrix is [0-9]{1,2}" +
" > 1. Completing it to avoid stopping the tree early.",
category=UserWarning)
warnings.filterwarnings(
"ignore",
message="Graph is not fully connected, spectral embedding" +
" may not work as expected.",
category=UserWarning)
y_pred=algorithm.fit(X)
t1 = time.time()
plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
if i_dataset == 0:
plt.title(name, size=18)
colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
'#f781bf', '#a65628', '#984ea3',
'#999999', '#e41a1c', '#dede00']),
int(max(y_pred) + 1))))
# add black color for outliers (if any)
colors = np.append(colors, ["#000000"])
plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])
plt.xlim(-2.5, 2.5)
plt.ylim(-2.5, 2.5)
plt.xticks(())
plt.yticks(())
plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
transform=plt.gca().transAxes, size=15,
horizontalalignment='right')
plot_num += 1
plt.show()
Because the algorithms are sensitive to their initial values, the clustering results can differ between runs; several attempts may be needed to obtain a good clustering. A quantitative comparison against the sklearn baselines is sketched below.
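Beyond eyeballing the plots, the agreement with the sklearn baselines can be quantified with the adjusted Rand index, which is 1.0 for identical partitions (up to label permutation) and near 0 for random ones. A sketch, assuming the K_means function above (the dataset and K are illustrative):

from sklearn import cluster, datasets
from sklearn.metrics import adjusted_rand_score
from sklearn.preprocessing import StandardScaler

X, y = datasets.make_blobs(n_samples=500, random_state=8)
X = StandardScaler().fit_transform(X)
mine, _ = K_means(X, 3, 0.01)
ref = cluster.KMeans(n_clusters=3).fit_predict(X)
print('ARI vs sklearn KMeans:', adjusted_rand_score(ref, mine))
print('ARI vs ground truth :', adjusted_rand_score(y, mine))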