k-medoids 聚类中距离度量函数的选取对聚类效果的影响

最新推荐文章于 2022-05-03 17:33:28 发布

脱离了高级趣味♂

最新推荐文章于 2022-05-03 17:33:28 发布

阅读量1.4k

点赞数

文章标签：聚类聚类算法 kmeans算法

本文链接：https://blog.csdn.net/qq_45424267/article/details/116274659

版权

数据聚类的一般流程为：选定特征、确定相似度、选择聚类算法、聚类结果评估。在此次试验中，各步骤具体采用方法如下：

选定特征
生成满足二维高斯分布的集合，特征为[x,y]
确定相似度(距离度量) ：若有两个样本点 $(x_1,y_1),(x_2,y_2)$
（1）欧式距离： $d=\sqrt{(x_1-x_2)^2 +(y_1-y_2)^2 }$
（2）曼哈顿距离： $d=\left| x_1-x_2\right|+\left|y_1-y_2\right|$
选择聚类算法
k-medoids
聚类结果评估
无(肉眼看图)

K-medoids算法流程
- 随机选择k个样本作为初始簇中心
- 对余下的样本，划分到距离它最近的簇中心所在的簇中
- 重新计算簇中心，对于每个簇$C_k $，找到一个样本点，使得其到簇中其余所有样本点的距离之和最小，即
  $\argmin_{x \in C_k} \sum_{x'\in C_k} \left\| x-x' \right\|^2$
- 如果各簇中心不再变化，则算法收敛，否则循环步骤2、3

matlab程序：

% Start from a clean console and workspace, and reset the random number
% generator so the sampled points are the same on every run.
clc
clear
rng default

% Draw two 2-D Gaussian samples of 2000 points each.
% Arguments to mvnrnd are: mean vector, covariance matrix, sample count.
set1 = mvnrnd([0 0],   [0.1  0; 0 0.1], 2000);
set2 = mvnrnd([1.5 1], [0.07 0; 0 1  ], 2000);

% Panel 1: the two samples drawn as separate series so each one gets its
% own colour.
subplot(2,2,1)
groups = {set1, set2};
for g = 1:numel(groups)
    scatter(groups{g}(:,1), groups{g}(:,2), '.');
    hold on   % keep earlier series when the next one is drawn
end
title("原数据")
axis equal

% Panel 2: both samples stacked into one matrix and drawn as a single
% unlabeled series. NOTE: the variable name `set` hides MATLAB's builtin
% set() for the rest of the script; the name is kept because the
% clustering section below reads it.
subplot(2,2,2)
set = vertcat(set1, set2);
scatter(set(:,1), set(:,2), '.');
title("混合数据")
axis equal
%% Clustering (k-medoids), run once per distance metric
% Input : `set`, an N-by-2 matrix of sample points built earlier in the script.
% Output: subplot 3 shows the Euclidean result, subplot 4 the Manhattan
%         result, each titled with the number of iterations it needed.
k = 2;                                   % number of clusters
numSamples = size(set,1);

% Draw k DISTINCT sample indices for the initial medoids. randi could return
% the same index twice, which makes two medoids identical and leaves one
% cluster empty. Note that randperm consumes the random stream differently
% from randi, so the initial medoids differ from those of older runs.
initIndex   = randperm(numSamples, k);
initMedoids = set(initIndex,:);

% `choice` doubles as the subplot index (3 or 4), hence the two empty
% placeholders at the front of both lookup tables.
str         = {'','','欧式距离','曼哈顿距离'};   % title prefix per choice
metricNames = {'','','euclidean','cityblock'};  % pdist2 metric per choice

for choice = 3:4  % 3: Euclidean distance, 4: Manhattan (city-block) distance
    metric = metricNames{choice};

    % Restart from the SAME initial medoids and reset the iteration counter,
    % so both metrics are compared under identical starting conditions and
    % each title reports only its own iteration count.
    meanPoint = initMedoids;
    times = 1;

    while true
        % ---- Assignment step ----
        % pdist2 returns an N-by-k matrix of distances from every sample to
        % every medoid; the column index of the row minimum is the label.
        [~, labels] = min(pdist2(set, meanPoint, metric), [], 2);

        % ---- Update step ----
        lastPoint = meanPoint;
        for i = 1:k
            sameKindSet = set(labels == i, :);   % all samples in cluster i
            if isempty(sameKindSet)
                % No member to choose from: keep the previous medoid
                % instead of failing on an empty index.
                continue
            end
            % The new medoid is the member whose summed distance to all
            % other members is smallest, measured with the same metric that
            % was used for the assignment step.
            withinDist = pdist2(sameKindSet, sameKindSet, metric);
            [~, I] = min(sum(withinDist, 2));
            meanPoint(i,:) = sameKindSet(I,:);
        end

        % Converged once no medoid moved during this iteration.
        if isequal(lastPoint, meanPoint)
            break
        end
        times = times + 1;
    end

    % ---- Plot the converged partition ----
    % A single scatter call coloured by label, so no cluster overwrites
    % another and no stale graphics objects pile up across iterations.
    subplot(2,2,choice)
    scatter(set(:,1), set(:,2), 1, labels);
    title([str{choice},num2str(times),'次迭代至收敛']);
    axis equal
end

在这里插入图片描述