Stable-Membership-based Auto-tuning Multi-Peak Clustering Algorithm (SMMP) (Matlab Code Implementation)

👨‍🎓 Personal homepage: 研学社的博客

💥💥💞💞 Welcome to this blog ❤️❤️💥💥

🏆 Blogger's strength: 🌞🌞🌞 The content of this blog aims to be carefully reasoned and logically clear, for the convenience of readers.

⛳️ Motto: In a journey of a hundred li, ninety is only the halfway point.

📋📋📋 The table of contents of this article is as follows: 🎁🎁🎁

Table of Contents

💥 1 Overview

📚 2 Results

🌈 3 Matlab Code Implementation

🎉 4 References


💥 1 Overview

Since most existing single-prototype clustering algorithms are not suitable for clusters of complex shapes, many multi-prototype clustering algorithms have been proposed. However, automatically estimating the number of clusters and detecting complex shapes remain challenging; solving these problems usually relies on user-specified parameters and can be very time-consuming. This paper proposes a Stable-Membership-based auto-tuning Multi-Peak clustering algorithm (SMMP), which achieves fast, automatic, and effective multi-prototype clustering without iteration. A dynamic association-transfer method is designed, applying the density-peak clustering technique to learn how representative each point is of its sub-cluster center during sub-cluster generation. Based on the learned representativeness, a border-link-based connectivity measure is adopted to achieve high-fidelity similarity estimation between sub-clusters. Meanwhile, under the assumption that a reasonable cluster should keep a relatively stable membership state as the clustering threshold changes, SMMP can automatically identify the numbers of sub-clusters and clusters, respectively. In addition, SMMP is designed for large datasets. Experimental results on synthetic and real-world datasets demonstrate the effectiveness of SMMP.
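Below is a minimal invocation sketch, assuming the function listed in Section 2 is saved as SMMP.m on the MATLAB path; the two-blob toy dataset and the plotting call are purely illustrative and not taken from the paper.

% Minimal usage sketch (illustrative; assumes SMMP.m from Section 2 is on the path)
rng(1);                                            % fix the random seed for reproducibility
data = [randn(300,2)*0.6; randn(300,2)*0.6 + 1.5]; % two partially touching 2-D Gaussian blobs
[CL,NC,runtime] = SMMP(data);                      % let SMMP estimate the cluster number automatically
fprintf('Estimated clusters: %d, runtime: %.3f s\n', NC, runtime);
gscatter(data(:,1), data(:,2), CL);                % visualize the resulting cluster labels
% [CL,NC] = SMMP(data,2);                          % alternatively, pass a known cluster number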

📚 2 Results

Only part of the results are shown here.

Partial code:

function [CL,NC,runtime] = SMMP(data,NC_input) % If you have prior knowledge, you can directly enter the cluster number 'NC_input'
close all;
eta = 0.1; %% used to determine the length of the similarity message vector
isshowresult = false; %% plotting flag: set to true only if a 'resultshow' helper function is available (not included in this partial listing)
fprintf('SMMP Clustering :)!\n');
%% normalization
data=(data-min(data))./(max(data)-min(data));
data(isnan(data))=0;
tic;
%% fast KNN search based on a kd-tree when the dimension is low; otherwise fall back to a full pairwise distance matrix
[n,d]  = size(data);
if n>200
    max_k = ceil(sqrt(n));
else
    max_k = max(15,round(n/10));
end
if d<=11
    [knn,knn_dist] = knnsearch(data,data,'k',max_k*2);
else
    dist = pdist2(data,data);
    [knn_dist,knn] = sort(dist,2);
end

%% adaptive tuning of parameter k
[k] = adaptive_tuning_k(knn,knn_dist,max_k);

%% setting of parameter k_b for our border link detection
k_b = min(round(k/2),2*floor(log(n)));

%% density estimation
rho = k*sum(knn_dist(:,2:k).^1,2).^-1; %% within-surrounding-similarity-based density w.r.t 'k'
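%% i.e., rho(i) = k / (sum of the distances from point 'i' to its k-1 nearest neighbors); denser surroundings give a larger rho(i)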

%% identify density peaks and calculate center-representativeness
theta = ones(n,1); %theta(i): the center-representativeness of point 'i' (initialization)
descendant = zeros(n,1); % descendant(i): the descendant node number of point 'i' (initialization)
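%% theta is propagated in descending-density order: a non-peak point inherits its parent's theta scaled by rho(point)/rho(parent), so representativeness decays monotonically away from each density peak (density peaks keep theta = 1)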
[~,OrdRho]=sort(rho,'descend');
for i=1:n
    for j=2:k
        neigh=knn(OrdRho(i),j);
        if(rho(OrdRho(i))<rho(neigh))
            NPN(OrdRho(i))=neigh;%% NPN: neighbor-based parent node, i.e., the nearest higher-density point within the KNN area.
            theta(OrdRho(i)) = theta(neigh)* rho(OrdRho(i))/rho(neigh);
            descendant(neigh) = descendant(neigh)+1;
            break
        end
    end
end
pk = find(theta==1);%% find density peaks (i.e., sub-cluster centers)
n_pk = length(pk);%% the number of density peaks

%% generate sub-clusters
sl=-1*ones(n,1); %% sl: sub-labels of points.
sl(pk) = (1:n_pk); %% give unique sub-labels to density peaks.
for i=1:n
    if (sl(OrdRho(i))==-1)
        sl(OrdRho(i))=sl(NPN(OrdRho(i)));%% inherit sub-labels from NPN
    end
end
for i = 1:n_pk
    child_sub= descendant(sl==i);
    edge(i) = length(find(child_sub==0)); %% edge(i): the number of edge points (points with no descendants) in sub-cluster 'i'
end

%% obtain cross-cluster border pairs
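%% (a border pair is a mutual-KNN pair, within the first k_b neighbors, whose two points carry different sub-labels)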
borderpair = obtain_borderpairs(sl,k_b,knn,knn_dist);

%% obtain border links
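%% (border links are chosen greedily from the border pairs in order of increasing distance, so that no point appears in more than one link)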
blink = obtain_borderlinks(borderpair);

%% if there is no border link, output sub-clustering result
if isempty(blink)
    CL = sl';
    NC = n_pk;
    runtime = toc;
    %% show result
    if isshowresult
        resultshow(data,CL);
    end
    return
end

%% else, calculate representativeness of border links for the similarity estimation between subclusters
n_blink = size(blink,1);
simimesgs = cell(n_pk,n_pk); %% simimesgs{i,j}: the set of all similarity messages between density peaks 'i' and 'j'
for i = 1:n_blink
    ii = blink(i,1);
    jj = blink(i,2);
    pk1 = sl(ii);
    pk2 = sl(jj);
    smesgs = simimesgs(pk1,pk2);
    smesgs{1} = [smesgs{1};(theta(ii)+theta(jj))/2];
    simimesgs(pk1,pk2) = smesgs;
    simimesgs(pk2,pk1) = smesgs;
end

%% similarity estimation between subclusters
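%% sim(i,j) takes the strongest similarity message between sub-clusters 'i' and 'j' and down-weights it by Gamma, the dispersion of the top messages, so that a single isolated strong message yields a lower similarity than many consistently strong ones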
sim = zeros(n_pk,n_pk);
sim_list = [];
for pk1=1:n_pk-1
    for pk2 =pk1+1:n_pk
        smesgs = simimesgs(pk1,pk2);
        smesgs = smesgs{:};
        max_smesg = max(smesgs);
        min_n_smesg = ceil(min(edge(pk1),edge(pk2))*eta); %% min_n_smesg: the minimum standard number of similarity message samples
        smesgs = sort([smesgs;zeros(min_n_smesg,1)],'descend');
        smesgs = smesgs(1:min_n_smesg);
        if max_smesg>0
            Gamma = mean(abs(smesgs - max_smesg))/max_smesg; %% Gamma: normalized dispersion of the top similarity messages around the maximum
            sim(pk1,pk2) = max_smesg*(1-Gamma);
            sim(pk2,pk1) = max_smesg*(1-Gamma);
        end
        sim_list = [sim_list sim(pk1,pk2)];
    end
end

%% Single-linkage clustering of sub-clusters according to SIM
SingleLink = linkage(1-sim_list,'single');
if nargin >= 2 %% case: the number of clusters 'NC' is given a priori
    NC = NC_input;
else
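    %% automatic detection of NC: scan the single-linkage merge heights; the cluster number that persists over the widest range of merge thresholds (the largest gap between consecutive merge heights) is taken as the stable cluster number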
    bata = [0;SingleLink(:,end)];
    bata(bata<0)=0;
    bataratio = [[n_pk+1-(1:n_pk-1)]' diff(bata)];
    bataratio = sortrows(bataratio,2,'descend');
    NC  = bataratio(1,1); %% the stable number of clusters, i.e., the one with the maximum bata-interval
end
CL_pk = cluster(SingleLink,'maxclust',NC); %% cut the single-linkage tree into NC clusters of sub-clusters

%% assign final cluster label
for i=1:n_pk
    CL(sl==i) = CL_pk(i); %% CL: the cluster label
end
runtime = toc;

fprintf('Finished!!!!\n');


function [k] = adaptive_tuning_k(knn,knn_dist,max_k)
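%% scan candidate values of 'k'; for each, count the resulting density peaks, then return the average 'k' over the peak count that occurs most often (the most stable peak count)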
n = size(knn,1);
n_k = zeros(n,1);  %% n_k(m): how many tested values of 'k' yield exactly 'm' density peaks
k_sum = zeros(n,1);  %% k_sum(m): the sum of the tested 'k' values that yield exactly 'm' density peaks
for cur_k = 2:ceil(max_k/20):max_k %% ceil(kmax/20): run about 20 times (to reduce computation time)
    cur_rho = cur_k*sum(knn_dist(:,2:cur_k).^1,2).^-1; %% within-surrounding-similarity-based density w.r.t 'cur_k'
    ispk = ones(n,1); %% density peak label
    for i=1:n
        for j=2:cur_k
            if cur_rho(i)< cur_rho(knn(i,j))
                ispk(i)=0; %% not a density peak
                break
            end
        end
    end
    n_pk = length(find(ispk==1)); %% n_pk: the number of density peaks w.r.t 'cur_k'
    n_k(n_pk) = n_k(n_pk)+1;
    k_sum(n_pk) = k_sum(n_pk) + cur_k;
end
stb_n_pk = find(n_k==max(n_k)); %% stb_n_pk: the stable number of density peaks, i.e., the peak count produced by the largest number of tested 'k' values
stb_n_pk = stb_n_pk(1);
k = round((k_sum(stb_n_pk)/n_k(stb_n_pk))); %% obtain our parameter $k$

function [borderpair] = obtain_borderpairs(sl,k_b,knn,knn_dist)
borderpair = [];
n = length(sl);
for i=1:n
    label_i = sl(i);
    for j = 2:k_b
        i_nei = knn(i,j);
        dist_i_nei = knn_dist(i,j);
        label_nei = sl(i_nei);
        if label_i ~= label_nei && any(knn(i_nei,2:k_b)==i) %% mutual-KNN check: 'i' must also be among the first k_b neighbors of 'i_nei'
            borderpair = [borderpair;[i i_nei dist_i_nei]];
            break
        end
    end
end

function [blink] = obtain_borderlinks(borderpair)
if isempty(borderpair)
    blink = [];
else
    borderpair(:,1:2) = sort(borderpair(:,1:2),2);
    [~,index] = unique(borderpair(:,3));
    borderpair = borderpair(index,:);
    borderpair = sortrows(borderpair,3);
    n_pairs = size(borderpair,1);
    blink = []; %% blink: border link
    for i = 1:n_pairs
        bp = borderpair(i,1:2);
        if isempty(intersect(bp,blink))
            blink = [blink;bp];
        end
    end
end

🌈 3 Matlab Code Implementation

🎉 4 References

Some of the theory comes from online sources; please contact us for removal in case of any infringement.

[1] N. X. Vinh and J. Epps, "A Novel Approach for Automatic Number of Clusters Detection Based on Consensus Clustering," in Proc. IEEE Int. Conf. on Bioinformatics and Bioengineering, Taipei, Taiwan, 2009.
[2] N. X. Vinh, J. Epps, and J. Bailey, "Information Theoretic Measures for Clusterings Comparison: Is a Correction for Chance Necessary?," in Proc. 26th International Conference on Machine Learning (ICML'09), 2009.
[3] N. X. Vinh, J. Epps, and J. Bailey, "Information Theoretic Measures for Clusterings Comparison: Variants, Properties, Normalization and Correction for Chance," Journal of Machine Learning Research, 11(Oct): 2837-2854, 2010.
[4] J. Guan, S. Li, X. He, J. Zhu, J. Chen, and P. Si, "SMMP: A Stable-Membership-based Auto-tuning Multi-Peak Clustering Algorithm," 2022.
