机器学习算法之聚类算法(密度聚类)原理+手动实现

机器学习算法之聚类算法(密度聚类)原理+手动实现

前言

笔者在最近完成了一道数据挖掘课程的有关密度聚类的小实验,为了实现DBSCAN算法,笔者查询了多篇博客,学习他们对DBSCAN方法的实现步骤设计,复现时发现总是与调库计算的结果存在些许误差(这里把笔者debug得快疯了,最后总结原因:部分博客描述的算法过程有误或者存在歧义,导致笔者的复现失败。),最终笔者想到了西瓜书,通过查询书中对DBSCAN算法的步骤介绍,从而成功复现了该聚类算法。
在此之前自己查询网上有关DBSCAN的说明,写了三个版本的DBSCAN都失败了,也是为了让大家少踩雷,故笔者打算将西瓜书中的DBSCAN算法步骤及实现代码进行分享。

介绍密度聚类方法DBSCAN

首先是对DBSCAN的一个总体介绍
在这里插入图片描述
学习DBSCAN所需要了解的前提知识:
在这里插入图片描述
在这里插入图片描述

介绍DBSCAN伪代码

在这个部分,就是笔者疯狂踩雷的部分了,看了三种网上的方法,无一成功复现。(也许是笔者太菜了)在这里我介绍一种我认为最靠谱的,当然我自己也成功复现了的流程——西瓜书上对DBSCAN的介绍,如图所示:
在这里插入图片描述

介绍DBSCAN手动实现代码(Python)

接下来就是我自己写的python版本的DBSCAN了。
说在前面:笔者还未对ZWDBSCAN类中的MicroF1、MacroF1、和轮廓系数S进行测试,不一定有效,此部分内容仅供参考,明早笔者会进行测试,如果不行的话会及时修改并上传的。但聚类部分笔者是经过测试了,确保和调库计算的结果能够一样。

Micro、Macro和轮廓系数已经更新了,但需要在保证聚类簇数和数据给定的真实标签数目相同时使用(通过调节eps和minPoints做到)不然会报错,后续如果需要改进的话会及时更新

更新完毕。两种F1平均算法与调库计算结果完全一致,轮廓系数会与调库有少许不同,测试结果有0.5736—0.5693、0.2165—0.1971,应该也没有问题

直接见代码吧:

import numpy as np

import pandas as pd

import math

from sklearn.cluster import DBSCAN

import sklearn.metrics as sm

class ZWDBSCAN(object):
    """Hand-written DBSCAN following the pseudo-code in Zhou Zhihua's
    "Machine Learning" (the "watermelon book"), plus evaluation helpers
    for Micro-F1, Macro-F1 and the silhouette coefficient.

    Attributes:
        eps: neighbourhood radius -- two samples within this distance are
            neighbours (density-reachable candidates).
        minPoints: minimum neighbourhood size (the sample itself included)
            for a sample to count as a core point.
        labels: cluster label per sample after fit(); -1 marks outliers.
        center_point: indices of the core points found by fit().
        edge_point: indices of the border points (clustered but not core).
        outlier_point: indices of the noise points.
        datas: the data matrix passed to fit() (2-D numpy array).
        MicroF1, MacroF1: micro-/macro-averaged F1 of labels vs. truth.
        S_coef: silhouette coefficient of the clustering.

    NOTE(review): the F1 helpers assume the number of clusters produced by
    fit() equals the number of true classes (tune eps/minPoints until it
    does), that true labels have been shifted to start at 0, and that
    fit() numbers clusters 0, 1, 2, ... in discovery order.
    """

    def __init__(self, eps=10, minPoints=1):
        self.eps = eps                  # neighbourhood radius
        self.minPoints = minPoints      # core-point density threshold
        self.labels = []
        self.center_point = []
        self.edge_point = []
        self.outlier_point = []
        self.datas = []
        self.MicroF1 = 0.0
        self.MacroF1 = 0.0
        self.S_coef = 0.0

    def getdistance(self, x1, x2):
        """Euclidean distance between two sample vectors."""
        return math.sqrt(np.power(x1 - x2, 2).sum())

    def get_ai(self, i):
        """a(i) of the silhouette: mean distance from sample i to the
        other members of its own cluster."""
        sumdistance = 0.0
        count = 0
        for j in range(len(self.datas)):
            if self.labels[j] == self.labels[i] and j != i:
                sumdistance += self.getdistance(self.datas[i, :], self.datas[j, :])
                count += 1
        # Singleton cluster: no other members. Return a(i) = 0 (matching
        # the usual convention of silhouette 0 for singletons); this also
        # avoids the ZeroDivisionError the original code raised here.
        if count == 0:
            return 0.0
        return sumdistance / count

    def get_bi(self, i, newdata):
        """Mean distance from sample i to every member of another cluster
        (`newdata` is that cluster's list of sample vectors)."""
        sumdis = 0.0
        for point in newdata:
            sumdis += self.getdistance(self.datas[i, :], point)
        return sumdis / len(newdata)

    def getcenter(self, data):
        """Column-wise mean (centroid) of a list of sample vectors."""
        data = np.array(data)
        return [np.mean(data[:, col]) for col in range(len(data[0]))]

    def getnearpoints(self, i, data, r):
        """Indices of every sample within distance r of sample i.

        The sample itself is included, matching sklearn's `min_samples`
        neighbourhood-counting semantics.
        """
        res = []
        for j in range(len(data)):
            if self.getdistance(data[j, :], data[i, :]) <= r:
                res.append(j)
        return res

    # 训练模型
    def fit(self, datas):
        """Cluster `datas` with DBSCAN (watermelon-book procedure).

        Fills self.labels (cluster id per sample, -1 for noise) plus
        self.center_point, self.edge_point and self.outlier_point.
        """
        self.datas = datas
        # reset per-fit results so repeated fit() calls don't accumulate
        # stale indices (the original only ever appended to these)
        self.outlier_point = []
        self.edge_point = []
        minPoints = self.minPoints
        r = self.eps
        # 1. Collect the core points: samples whose eps-neighbourhood
        #    (themselves included) holds at least minPoints samples.
        center_point = []
        for i in range(len(datas)):
            nearpoints = self.getnearpoints(i, datas, r)
            if len(nearpoints) >= minPoints:
                center_point.append(i)
        self.center_point = center_point
        # 2. Grow one cluster per remaining core point.
        k = 0          # number of clusters found so far
        C = []         # list of clusters (each a list of sample indices)
        unvisited = list(range(len(datas)))
        while len(center_point):
            # snapshot the unvisited set before growing this cluster
            old_unvisited = unvisited
            # take an arbitrary remaining core point as the seed
            p = center_point[0]
            Q = [p]
            # mark the seed as visited (rebind, so old_unvisited survives)
            unvisited = [idx for idx in unvisited if idx != p]
            # breadth-first expansion of the cluster
            while len(Q):
                q = Q.pop(0)
                near_q = self.getnearpoints(q, datas, r)
                if len(near_q) >= minPoints:
                    # q is itself a core point: absorb its still-unvisited
                    # neighbours into the cluster
                    delta = [idx for idx in near_q if idx in unvisited]
                    for idx in delta:
                        Q.append(idx)
                        if idx in unvisited:
                            unvisited.remove(idx)
            k = k + 1
            # the new cluster is everything visited during this expansion
            Ckl = [idx for idx in old_unvisited if idx not in unvisited]
            C.append(Ckl)
            # core points swallowed by this cluster are done
            center_point = [cp for cp in center_point if cp not in Ckl]
        # 3. Derive per-sample labels; samples in no cluster are noise.
        labels = []
        for i in range(len(datas)):
            for j in range(len(C)):
                if i in C[j]:
                    labels.append(j)
                    break
            else:
                labels.append(-1)
                self.outlier_point.append(i)
        self.labels = labels
        # border points: assigned to a cluster but not core points
        for i in range(len(datas)):
            if i not in self.outlier_point and i not in self.center_point:
                self.edge_point.append(i)

    def cal_MicroF1(self, truelabels):
        """Micro-averaged F1 between self.labels and `truelabels`.

        Pools one-vs-rest TP/FP/FN counts over every true class before
        computing precision and recall (see the class docstring for the
        label-alignment assumptions).
        """
        # the distinct true classes, in first-appearance order
        classes = []
        for lab in truelabels:
            if lab not in classes:
                classes.append(lab)
        # pool the confusion counts over all classes
        TP = FP = FN = 0
        for c in classes:
            for j in range(len(truelabels)):
                true_is_c = truelabels[j] == c
                pred_is_c = self.labels[j] == c
                if true_is_c and pred_is_c:
                    TP += 1
                elif pred_is_c:
                    FP += 1
                elif true_is_c:
                    FN += 1
        # guard degenerate denominators instead of dividing by zero
        # (possible when every prediction is the -1 noise label)
        P = TP / (TP + FP) if TP + FP else 0.0
        R = TP / (TP + FN) if TP + FN else 0.0
        self.MicroF1 = 2.0 * P * R / (P + R) if P + R else 0.0

    def cal_MacroF1(self, truelabels):
        """Macro-averaged F1: per-class F1, then the unweighted mean."""
        # the distinct true classes, in first-appearance order
        classes = []
        for lab in truelabels:
            if lab not in classes:
                classes.append(lab)
        F = []
        for c in classes:
            # one-vs-rest confusion counts for class c
            TP = FP = FN = 0
            for j in range(len(truelabels)):
                true_is_c = truelabels[j] == c
                pred_is_c = self.labels[j] == c
                if true_is_c and pred_is_c:
                    TP += 1
                elif pred_is_c:
                    FP += 1
                elif true_is_c:
                    FN += 1
            # guard empty denominators (class never predicted / absent);
            # the original raised ZeroDivisionError in that case
            single_P = TP / (TP + FP) if TP + FP else 0.0
            single_R = TP / (TP + FN) if TP + FN else 0.0
            if single_P + single_R > 0:
                F.append(2.0 * single_P * single_R / (single_P + single_R))
            else:
                F.append(0)
        self.MacroF1 = np.mean(np.array(F))

    def cal_S_coef(self, labels):
        """Silhouette coefficient of the clustering.

        For each sample i: S(i) = (b(i) - a(i)) / max(a(i), b(i)), where
        a(i) is the mean intra-cluster distance and b(i) the mean distance
        to the nearest *other* cluster. The nearest cluster is picked by
        distance to cluster centroids, which is why the result can differ
        slightly from sklearn (sklearn compares mean point-to-point
        distances instead).

        NOTE(review): assumes `labels` contains only ids 0..k-1 (no -1
        noise label), since labels index the per-cluster buckets directly.
        """
        self.labels = labels
        # the distinct cluster ids present in `labels`
        classes = []
        for lab in self.labels:
            if lab not in classes:
                classes.append(lab)
        # bucket the samples by cluster id
        newdata = [[] for _ in classes]
        for i in range(len(self.datas)):
            newdata[self.labels[i]].append(self.datas[i])
        sumS = []
        for i in range(len(self.datas)):
            ai = self.get_ai(i)
            # candidate clusters for b(i): every cluster except i's own
            others = [c for c in classes if c != self.labels[i]]
            dists = [self.getdistance(self.datas[i, :], self.getcenter(newdata[c]))
                     for c in others]
            # BUG FIX: the original indexed `newdata` with the *position*
            # of the nearest centre within the own-cluster-excluded centre
            # list, which is not a cluster id, so b(i) was often computed
            # against the wrong cluster (sometimes i's own). Map the
            # position back to the actual cluster id first.
            nearest = others[dists.index(min(dists))]
            bi = self.get_bi(i, newdata[nearest])
            sumS.append((bi - ai) / max(ai, bi))
        self.S_coef = sum(sumS) / len(self.datas)

def main():
    """Load the wine dataset, run the hand-written DBSCAN, compare its
    labels against sklearn's DBSCAN, and print the evaluation metrics."""
    df = pd.read_csv('D:\桌面\wine.csv')

    # first column holds the ground-truth class labels
    labels_true = df.iloc[:,0].values
    # the remaining 13 columns form the feature matrix
    datas = df.iloc[:,1:14].values

    # shift true labels from 1-based to 0-based so they line up with the
    # 0-based cluster ids produced by DBSCAN (required by the F1 helpers)
    newlabels_true = labels_true-1

    # # r = 50
    # l = []
    # ll = range(1,101,10)
    # for i in range(len(ll)):
    #     l.append(ll[i])
    # # print(l)
    # for n in range(len(l)):
        # r = l[n]
    # eps / minPoints chosen by trial so the number of clusters found
    # matches the number of true classes
    r = 61
    minPoints = 10
    print("eps:")
    print(r)
    print("minPoints:")
    print(minPoints)
    model = ZWDBSCAN(r, minPoints)
    model.fit(datas)
    print(model.labels)
    print("调库结果:")
    # sklearn reference run with identical hyper-parameters
    db = DBSCAN(eps=r, min_samples=minPoints).fit(datas)
    print(db.labels_)
    print("两种结果差异值(不相同的值)个数:")
    # count positions where the two label vectors disagree
    t = 0
    for i in range(len(db.labels_)):
        if (model.labels[i] != db.labels_[i]):
            t = t + 1
    print(t)
    model.cal_MacroF1(newlabels_true)
    model.cal_MicroF1(newlabels_true)
    model.cal_S_coef(model.labels)
    # print(sm.f1_score(newlabels_true, model.labels, labels=[0, 1, 2], average='micro'))
    # print(sm.f1_score(newlabels_true, model.labels, labels=[0, 1, 2], average='macro'))
    print(model.MicroF1)
    print(model.MacroF1)
    print(model.S_coef)
if __name__ == "__main__":
    main()

运行结果如图所示:
在这里插入图片描述

后记

以后笔者再复现诸如此类的算法,绝不会再看网上的博客了,还是参考西瓜书吧,这是真的权威!!!

  • 4
    点赞
  • 9
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值