利用DBSCAN算法对双半月数据集进行聚类

最新推荐文章于 2023-04-11 21:26:46 发布

柳叶吴钩

最新推荐文章于 2023-04-11 21:26:46 发布

阅读量1.5k

点赞数 2

分类专栏：神经网络与机器学习笔记文章标签：聚类算法

本文链接：https://blog.csdn.net/moge19/article/details/89322780

版权

神经网络与机器学习笔记专栏收录该内容

40 篇文章 11 订阅

订阅专栏

1、生成数据集

class moon_data_class(object):
    def __init__(self,N,d,r,w):
        self.N=N
        self.w=w
      
        self.d=d
        self.r=r
    
   
    def sgn(self,x):
        if(x>0):
            return 1;
        else:
            return -1;
        
    def sig(self,x):
        return 1.0/(1+np.exp(x))
    
        
    def dbmoon(self):
        N1 = 10*self.N
        N = self.N
        r = self.r
        w2 = self.w/2
        d = self.d
        done = True
        data = np.empty(0)
        while done:
            #generate Rectangular data
            tmp_x = 2*(r+w2)*(np.random.random([N1, 1])-0.5)
            tmp_y = (r+w2)*np.random.random([N1, 1])
            tmp = np.concatenate((tmp_x, tmp_y), axis=1)
            tmp_ds = np.sqrt(tmp_x*tmp_x + tmp_y*tmp_y)
            #generate double moon data ---upper
            idx = np.logical_and(tmp_ds > (r-w2), tmp_ds < (r+w2))
            idx = (idx.nonzero())[0]
     
            if data.shape[0] == 0:
                data = tmp.take(idx, axis=0)
            else:
                data = np.concatenate((data, tmp.take(idx, axis=0)), axis=0)
            if data.shape[0] >= N:
                done = False
        #print (data)
        db_moon = data[0:N, :]
        #print (db_moon)
        #generate double moon data ----down
        data_t = np.empty([N, 2])
        data_t[:, 0] = data[0:N, 0] + r
        data_t[:, 1] = -data[0:N, 1] - d
        db_moon = np.concatenate((db_moon, data_t), axis=0)
        return db_moon

2、计算最佳半径

def epsilon(data,Minpts):
    m,n = np.shape(data)
    xMax = np.max(data)
    xMin = np.min(data)
    eps = ((np.prod(xMax-xMin) * Minpts * math.gamma(0.5*n+1))/(m*math.sqrt(math.pi **n)))**(1.0/n)
    return eps

3、DBSCAN算法


def distance(data):
    m, n = np.shape(data)
    dis = np.mat(np.zeros((m, m)))
    for i in range(m):
        for j in range(i, m):
            # 计算i和j之间的欧式距离
            tmp = 0
            for k in range(n):
                tmp += (data[i, k] - data[j, k]) * (data[i, k] - data[j, k])
            dis[i, j] = np.sqrt(tmp)
            dis[j, i] = dis[i, j]
    return dis

def find_eps(distance_D, eps):
    ind = []
    n = np.shape(distance_D)[1]
    for j in range(n):
        if distance_D[0, j] <= eps:
            ind.append(j)
    return ind

def dbscan(data, eps, MinPts):
    m = np.shape(data)[0]
    # 区分核心点1，边界点0和噪音点-1
    types = np.mat(np.zeros((1, m)))
    sub_class = np.mat(np.zeros((1, m)))
    # 用于判断该点是否处理过，0表示未处理过
    dealed = np.mat(np.zeros((m, 1)))
    # 计算每个数据点之间的距离
    dis = distance(data)
    # 用于标记类别
    number = 1
    
    # 对每一个点进行处理
    for i in range(m):
        # 找到未处理的点
        if dealed[i, 0] == 0:
            # 找到第i个点到其他所有点的距离
            D = dis[i, ]
            # 找到半径eps内的所有点
            ind = find_eps(D, eps)
            # 区分点的类型
            # 边界点
            if len(ind) > 1 and len(ind) < MinPts + 1:
                types[0, i] = 0
                sub_class[0, i] = 0
            # 噪音点
            if len(ind) == 1:
                types[0, i] = -1
                sub_class[0, i] = -1
                dealed[i, 0] = 1
            # 核心点
            if len(ind) >= MinPts + 1:
                types[0, i] = 1
                for x in ind:
                    sub_class[0, x] = number
                # 判断核心点是否密度可达
                while len(ind) > 0:
                    dealed[ind[0], 0] = 1
                    D = dis[ind[0], ]
                    tmp = ind[0]
                    del ind[0]
                    ind_1 = find_eps(D, eps)
                    
                    if len(ind_1) > 1:  # 处理非噪音点
                        for x1 in ind_1:
                            sub_class[0, x1] = number
                        if len(ind_1) >= MinPts + 1:
                            types[0, tmp] = 1
                        else:
                            types[0, tmp] = 0
                            
                        for j in range(len(ind_1)):
                            if dealed[ind_1[j], 0] == 0:
                                dealed[ind_1[j], 0] = 1
                                ind.append(ind_1[j])
                                sub_class[0, ind_1[j]] = number
                number += 1
    
    # 最后处理所有未分类的点为噪音点
    ind_2 = ((sub_class == 0).nonzero())[1]
    for x in ind_2:
        sub_class[0, x] = -1
        types[0, x] = -1
        
    return types, sub_class

4、运行结果

原图
在这里插入图片描述聚类结果

5 完整代码

# -*- coding: utf-8 -*-
"""
Created on Mon Apr 15 21:33:41 2019

@author: ASUS
"""
import numpy as np
import pylab as pl
import random as rd
import math
import random
import matplotlib.pyplot as plt
import numpy as np
import mpl_toolkits.mplot3d
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from scipy import *
from scipy.linalg import norm, pinv
 
from matplotlib import pyplot as plt

class moon_data_class(object):
    def __init__(self,N,d,r,w):
        self.N=N
        self.w=w
      
        self.d=d
        self.r=r
    
   
    def sgn(self,x):
        if(x>0):
            return 1;
        else:
            return -1;
        
    def sig(self,x):
        return 1.0/(1+np.exp(x))
    
        
    def dbmoon(self):
        N1 = 10*self.N
        N = self.N
        r = self.r
        w2 = self.w/2
        d = self.d
        done = True
        data = np.empty(0)
        while done:
            #generate Rectangular data
            tmp_x = 2*(r+w2)*(np.random.random([N1, 1])-0.5)
            tmp_y = (r+w2)*np.random.random([N1, 1])
            tmp = np.concatenate((tmp_x, tmp_y), axis=1)
            tmp_ds = np.sqrt(tmp_x*tmp_x + tmp_y*tmp_y)
            #generate double moon data ---upper
            idx = np.logical_and(tmp_ds > (r-w2), tmp_ds < (r+w2))
            idx = (idx.nonzero())[0]
     
            if data.shape[0] == 0:
                data = tmp.take(idx, axis=0)
            else:
                data = np.concatenate((data, tmp.take(idx, axis=0)), axis=0)
            if data.shape[0] >= N:
                done = False
        #print (data)
        db_moon = data[0:N, :]
        #print (db_moon)
        #generate double moon data ----down
        data_t = np.empty([N, 2])
        data_t[:, 0] = data[0:N, 0] + r
        data_t[:, 1] = -data[0:N, 1] - d
        db_moon = np.concatenate((db_moon, data_t), axis=0)
        return db_moon

def epsilon(data,Minpts):
    m,n = np.shape(data)
    xMax = np.max(data)
    xMin = np.min(data)
    eps = ((np.prod(xMax-xMin) * Minpts * math.gamma(0.5*n+1))/(m*math.sqrt(math.pi **n)))**(1.0/n)
    return eps

def distance(data):
    m, n = np.shape(data)
    dis = np.mat(np.zeros((m, m)))
    for i in range(m):
        for j in range(i, m):
            # 计算i和j之间的欧式距离
            tmp = 0
            for k in range(n):
                tmp += (data[i, k] - data[j, k]) * (data[i, k] - data[j, k])
            dis[i, j] = np.sqrt(tmp)
            dis[j, i] = dis[i, j]
    return dis

def find_eps(distance_D, eps):
    ind = []
    n = np.shape(distance_D)[1]
    for j in range(n):
        if distance_D[0, j] <= eps:
            ind.append(j)
    return ind

def dbscan(data, eps, MinPts):
    m = np.shape(data)[0]
    # 区分核心点1，边界点0和噪音点-1
    types = np.mat(np.zeros((1, m)))
    sub_class = np.mat(np.zeros((1, m)))
    # 用于判断该点是否处理过，0表示未处理过
    dealed = np.mat(np.zeros((m, 1)))
    # 计算每个数据点之间的距离
    dis = distance(data)
    # 用于标记类别
    number = 1
    
    # 对每一个点进行处理
    for i in range(m):
        # 找到未处理的点
        if dealed[i, 0] == 0:
            # 找到第i个点到其他所有点的距离
            D = dis[i, ]
            # 找到半径eps内的所有点
            ind = find_eps(D, eps)
            # 区分点的类型
            # 边界点
            if len(ind) > 1 and len(ind) < MinPts + 1:
                types[0, i] = 0
                sub_class[0, i] = 0
            # 噪音点
            if len(ind) == 1:
                types[0, i] = -1
                sub_class[0, i] = -1
                dealed[i, 0] = 1
            # 核心点
            if len(ind) >= MinPts + 1:
                types[0, i] = 1
                for x in ind:
                    sub_class[0, x] = number
                # 判断核心点是否密度可达
                while len(ind) > 0:
                    dealed[ind[0], 0] = 1
                    D = dis[ind[0], ]
                    tmp = ind[0]
                    del ind[0]
                    ind_1 = find_eps(D, eps)
                    
                    if len(ind_1) > 1:  # 处理非噪音点
                        for x1 in ind_1:
                            sub_class[0, x1] = number
                        if len(ind_1) >= MinPts + 1:
                            types[0, tmp] = 1
                        else:
                            types[0, tmp] = 0
                            
                        for j in range(len(ind_1)):
                            if dealed[ind_1[j], 0] == 0:
                                dealed[ind_1[j], 0] = 1
                                ind.append(ind_1[j])
                                sub_class[0, ind_1[j]] = number
                number += 1
    
    # 最后处理所有未分类的点为噪音点
    ind_2 = ((sub_class == 0).nonzero())[1]
    for x in ind_2:
        sub_class[0, x] = -1
        types[0, x] = -1
        
    return types, sub_class

if __name__ == "__main__":   
    step=0
    color=['.r','.g','.b','.y']#颜色种类
    dcolor=['*r','*g','*b','*y','or','og','ob','oy']#颜色种类
    frames = []
    
    N = 100
    d = -5
    r = 10
    width = 1
     
    MinPts= 50
        
    data_source = moon_data_class(N, d, r, width)
    data = data_source.dbmoon()
     # x0 = [1 for x in range(1,401)]
    input_cells = np.array([np.reshape(data[0:2*N, 0], len(data)), np.reshape(data[0:2*N, 1], len(data))]).transpose()
        
    labels_pre = [[-1.] for y in range(1, 401)]
    labels_pos = [[1. ] for y in range(1, 401)]
    label=labels_pre+labels_pos
    dataSet = np.mat(input_cells)
    labels = np.mat(label)
    
    eps = epsilon(data,MinPts)
    types, sub_class = dbscan(data, eps, MinPts)
    
    plt.plot(data[0:N, 0], data[0:N, 1], 'r*', data[N:2*N, 0], data[N:2*N, 1], 'r*')
    plt.show() 
    sub_class = np.array(sub_class)
    for i in range(200):
        color_type = dcolor[int(sub_class[0][i])%8]
        plt.plot(data[i, 0], data[i, 1],color_type)
         
#         elif types[0][i]==2:
#             plt.plot(data[i, 0], data[i, 1],'r*')
#         elif types[0][i]==3:
#             plt.plot(data[i, 0], data[i, 1],'b*')
#         elif types[0][i]==4:
#             plt.plot(data[i, 0], data[i, 1],'g*')
    plt.show()

柳叶吴钩

关注

2
点赞
踩
10

收藏

觉得还不错? 一键收藏
2
评论
利用DBSCAN算法对双半月数据集进行聚类

1、生成数据集class moon_data_class(object): def __init__(self,N,d,r,w): self.N=N self.w=w self.d=d self.r=r def sgn(self,x): if(x>0): ...
复制链接

扫一扫