👨🎓个人主页:研学社的博客
💥💥💞💞欢迎来到本博客❤️❤️💥💥
🏆博主优势:🌞🌞🌞为了方便读者,博客内容尽量做到思维缜密、逻辑清晰。
⛳️座右铭:行百里者,半于九十。
📋📋📋本文目录如下:🎁🎁🎁
目录
💥1 概述
由于现有的单原型聚类算法大多不适用于复杂形状的聚类,研究者提出了许多多原型聚类算法。然而,自动估计聚类数量和检测复杂形状的簇仍然具有挑战性,现有方法通常依赖于用户指定的参数,并且可能非常耗时。本文介绍一种基于稳定隶属度的自调优多峰聚类算法(SMMP),无需迭代即可实现快速、自动、有效的多原型聚类。该算法设计了一种动态关联-转移方法,应用密度峰值聚类技术,在子簇生成过程中学习各点对子簇中心的代表性。根据学习到的代表性,采用基于边界链路的连通性测度实现子簇之间的高保真相似性评估。同时,基于“合理的聚类在聚类阈值变化时应具有相对稳定的隶属状态”这一假设,SMMP可以分别自动识别子簇和簇的数量。此外,SMMP 专为大型数据集而设计。在合成数据集和真实数据集上的实验结果证明了SMMP的有效性。
📚2 运行结果
这里仅展现部分结果
部分代码:
function [CL,NC,runtime] = SMMP(data,NC_input)
%SMMP Multi-peak clustering: density-peak sub-clusters merged by single linkage.
%
%   [CL,NC,runtime] = SMMP(data) clusters the rows of the n-by-d matrix
%   'data' and chooses the number of clusters automatically.
%   [CL,NC,runtime] = SMMP(data,NC_input) uses 'NC_input' as the number of
%   clusters when sub-clusters have to be merged.
%
%   Inputs
%     data      n-by-d numeric matrix, one point per row. It is min-max
%               normalised per column before clustering.
%     NC_input  (optional) desired number of final clusters. It is ignored
%               on the early-exit path where no border link exists.
%
%   Outputs
%     CL        1-by-n row vector of cluster labels.
%     NC        number of clusters in CL.
%     runtime   elapsed time in seconds, measured from just after the
%               normalisation step.
%
%   Side effects: closes all open figures and prints progress messages.

close all;
eta = 0.1;              % used to determine the length of the similarity message vector
isshowresult = false;   % set to true to plot the early-exit result; needs a
                        % 'resultshow' function on the path (not defined in this file)
fprintf('SMMP Clustering :)!\n');

%% normalization
data = (data-min(data))./(max(data)-min(data));
data(isnan(data)) = 0;  % constant columns give 0/0 = NaN; map them to 0
tic;

%% KNN matrix: knnsearch for low-dimensional data (d <= 11), full sort otherwise
[n,d] = size(data);
if n>200
    max_k = ceil(sqrt(n));
else
    max_k = max(15,round(n/10));
end
if d<=11
    [knn,knn_dist] = knnsearch(data,data,'k',max_k*2);
else
    dist = pdist2(data,data);
    [knn_dist,knn] = sort(dist,2);
end

%% adaptive tuning of parameter k
[k] = adaptive_tuning_k(knn,knn_dist,max_k);

%% setting of parameter k_b for the border link detection
k_b = min(round(k/2),2*floor(log(n)));

%% density estimation
rho = k*sum(knn_dist(:,2:k).^1,2).^-1; % within-surrounding-similarity-based density w.r.t. 'k'

%% identify density peaks and calculate center-representativeness
theta = ones(n,1);      % theta(i): the center-representativeness of point 'i'
descendant = zeros(n,1);% descendant(i): the number of direct children of point 'i'
NPN = zeros(n,1);       % NPN(i): neighbor-based parent node, i.e. the nearest
                        % higher-density point within the KNN area (0 = none)
[~,OrdRho] = sort(rho,'descend');
for i=1:n
    cur = OrdRho(i);
    for j=2:k
        neigh = knn(cur,j);
        if rho(cur)<rho(neigh)
            NPN(cur) = neigh;
            % The parent has strictly higher density, so it was visited
            % earlier and its theta is already final.
            theta(cur) = theta(neigh)*rho(cur)/rho(neigh);
            descendant(neigh) = descendant(neigh)+1;
            break
        end
    end
end
% Points that never found a denser neighbour keep theta == 1 exactly;
% every other point was multiplied by a density ratio below 1.
pk = find(theta==1);    % density peaks (i.e. sub-cluster centers)
n_pk = length(pk);      % the number of density peaks

%% generate sub-clusters
sl = -1*ones(n,1);      % sl: sub-labels of points
sl(pk) = (1:n_pk);      % give unique sub-labels to density peaks
for i=1:n
    cur = OrdRho(i);
    if sl(cur)==-1
        sl(cur) = sl(NPN(cur)); % inherit the sub-label from the parent node
    end
end
n_edge = zeros(1,n_pk); % n_edge(i): number of leaf points (no children) in sub-cluster 'i'
for i = 1:n_pk
    child_sub = descendant(sl==i);
    n_edge(i) = length(find(child_sub==0));
end

%% obtain cross-cluster border pairs
borderpair = obtain_borderpairs(sl,k_b,knn,knn_dist);

%% obtain border links
blink = obtain_borderlinks(borderpair);

%% if there is no border link, output the sub-clustering result
if isempty(blink)
    CL = sl';
    NC = n_pk;
    runtime = toc;
    if isshowresult
        resultshow(data,CL);
    end
    return
end

%% else, collect representativeness of border links as similarity messages
n_blink = size(blink,1);
simimesgs = cell(n_pk,n_pk); % simimesgs{i,j}: all similarity messages between density peaks 'i' and 'j'
for i = 1:n_blink
    ii = blink(i,1);
    jj = blink(i,2);
    pk1 = sl(ii);
    pk2 = sl(jj);
    smesgs = simimesgs(pk1,pk2);
    smesgs{1} = [smesgs{1};(theta(ii)+theta(jj))/2];
    simimesgs(pk1,pk2) = smesgs;
    simimesgs(pk2,pk1) = smesgs;
end

%% similarity estimation between sub-clusters
sim = zeros(n_pk,n_pk);
sim_list = []; % upper triangle of 'sim' in row-major order, passed to linkage below
for pk1=1:n_pk-1
    for pk2 = pk1+1:n_pk
        smesgs = simimesgs(pk1,pk2);
        smesgs = smesgs{:};
        max_smesg = max(smesgs);
        % min_n_smesg: the minimum standard number of similarity message samples
        min_n_smesg = ceil(min(n_edge(pk1),n_edge(pk2))*eta);
        % Pad with zeros so that pairs with too few messages are penalised.
        smesgs = sort([smesgs;zeros(min_n_smesg,1)],'descend');
        smesgs = smesgs(1:min_n_smesg);
        if max_smesg>0
            Gamma = mean(abs(smesgs - max_smesg))/max_smesg;
            sim(pk1,pk2) = max_smesg*(1-Gamma);
            sim(pk2,pk1) = max_smesg*(1-Gamma);
        end
        sim_list = [sim_list sim(pk1,pk2)];
    end
end

%% single-linkage clustering of sub-clusters according to SIM
SingleLink = linkage(1-sim_list,'single');
if nargin >= 2 % case: the number of clusters 'NC' is known a priori
    NC = NC_input;
else
    bata = [0;SingleLink(:,end)];
    bata(bata<0) = 0;
    % Row j pairs the cluster count before merge j with the height gap preceding it.
    bataratio = [[n_pk+1-(1:n_pk-1)]' diff(bata)];
    bataratio = sortrows(bataratio,2,'descend');
    NC = bataratio(1,1); % the stable number of clusters, i.e. the one with the maximum bata-interval
end
CL_pk = cluster(SingleLink,NC);

%% assign final cluster labels
CL = zeros(1,n); % CL: the cluster label of every point
for i=1:n_pk
    CL(sl==i) = CL_pk(i);
end
runtime = toc;
fprintf('Finished!!!!\n');
function [k] = adaptive_tuning_k(knn,knn_dist,max_k)
%ADAPTIVE_TUNING_K Pick k from the most frequently observed density-peak count.
%
%   k = adaptive_tuning_k(knn,knn_dist,max_k) sweeps candidate values of k
%   from 2 to max_k (about 20 evenly spaced candidates), counts the density
%   peaks each candidate produces, and returns the rounded mean of the
%   candidates that produced the most frequently occurring peak count
%   (the smallest such count on ties).
%
%   knn       n-by-m neighbour index matrix; column 1 is skipped.
%   knn_dist  n-by-m matrix of distances matching 'knn'.
%   max_k     largest candidate k to try.

num_pts = size(knn,1);
hits    = zeros(num_pts,1); % hits(p): how many candidate k values produced p density peaks
k_total = zeros(num_pts,1); % k_total(p): sum of the candidate k values that produced p peaks
stride  = ceil(max_k/20);   % roughly 20 candidates, to reduce computation time

for cand_k = 2:stride:max_k
    % within-surrounding-similarity-based density w.r.t. 'cand_k'
    dens = cand_k*sum(knn_dist(:,2:cand_k),2).^-1;
    % Density of every neighbour of every point, one row per point.
    nbr_dens = dens(knn(:,2:cand_k));
    % A point is a density peak when none of its neighbours is strictly denser.
    is_peak = ~any(dens < nbr_dens,2);
    peak_cnt = nnz(is_peak);
    hits(peak_cnt)    = hits(peak_cnt)+1;
    k_total(peak_cnt) = k_total(peak_cnt)+cand_k;
end

% Stable number of density peaks: the count hit by the most candidates.
stable_cands = find(hits==max(hits));
stable_cnt = stable_cands(1);
k = round(k_total(stable_cnt)/hits(stable_cnt));
function [borderpair] = obtain_borderpairs(sl,k_b,knn,knn_dist)
%OBTAIN_BORDERPAIRS Find mutual near neighbours that carry different sub-labels.
%
%   borderpair = obtain_borderpairs(sl,k_b,knn,knn_dist) scans, for every
%   point i, its neighbours in columns 2..k_b of 'knn' in order and records
%   the first one that has a different sub-label and also lists i among its
%   own columns 2..k_b. Each recorded row is [i, neighbour, distance].
%   Returns [] when no such pair exists.
%
%   sl        vector of sub-labels, one per point.
%   k_b       last neighbour column to inspect.
%   knn       neighbour index matrix; column 1 is skipped.
%   knn_dist  distances matching 'knn'.

num_pts = length(sl);
found = zeros(num_pts,3); % at most one pair per point
n_found = 0;

for p = 1:num_pts
    for col = 2:k_b
        q = knn(p,col);
        if sl(p) == sl(q)
            continue
        end
        % Keep the pair only when p is also among q's first k_b neighbours.
        if any(knn(q,2:k_b) == p)
            n_found = n_found+1;
            found(n_found,:) = [p q knn_dist(p,col)];
            break
        end
    end
end

if n_found == 0
    borderpair = [];
else
    borderpair = found(1:n_found,:);
end
function [blink] = obtain_borderlinks(borderpair)
%OBTAIN_BORDERLINKS Greedily select border links from cross-cluster border pairs.
%
%   blink = obtain_borderlinks(borderpair) takes rows [i, j, distance],
%   removes duplicate pairs (the same two points reported from either side),
%   orders the remaining pairs by increasing distance, and keeps a pair only
%   if neither of its points already belongs to a previously kept link.
%
%   borderpair  m-by-3 matrix of [i, j, distance] rows, or empty.
%   blink       r-by-2 matrix of point indices (smaller index first), or []
%               when 'borderpair' is empty.

if isempty(borderpair)
    blink = [];
    return
end

% Put the smaller index first so (i,j) and (j,i) become the same row.
borderpair(:,1:2) = sort(borderpair(:,1:2),2);

% De-duplicate on the index pair rather than on the distance value:
% distinct pairs can share a distance (grid-like data, duplicate points)
% and must not be dropped.
[~,keep] = unique(borderpair(:,1:2),'rows');
borderpair = borderpair(keep,:);
borderpair = sortrows(borderpair,3);

n_pairs = size(borderpair,1);
blink = []; % blink: border links selected so far
for i = 1:n_pairs
    bp = borderpair(i,1:2);
    % Each point may take part in at most one border link.
    if isempty(blink) || ~any(ismember(bp,blink(:)))
        blink = [blink;bp];
    end
end
🌈3 Matlab代码实现
🎉4 参考文献
部分理论来源于网络,如有侵权请联系删除。
[1] 'A Novel Approach for Automatic Number of Clusters Detection based on Consensus Clustering', N.X. Vinh, and Epps, J., in Procs. IEEE Int. Conf. on Bioinformatics and Bioengineering (Taipei, Taiwan), 2009.
[2] 'Information Theoretic Measures for Clusterings Comparison: Is a Correction for Chance Necessary?', N.X. Vinh, Epps, J. and Bailey, J., in Procs. the 26th International Conference on Machine Learning (ICML'09)
[3] 'Information Theoretic Measures for Clusterings Comparison: Variants, Properties, Normalization and Correction for Chance', N.X. Vinh, Epps, J. and Bailey, J., Journal of Machine Learning Research, 11(Oct), pages 2837-2854, 2010
[4] Junyi Guan, Sheng Li, Xiongxiong He, Jinhui Zhu, Jiajia Chen, Peng Si (2022) SMMP: A Stable-Membership-based Auto-tuning Multi-Peak Clustering Algorithm.