[General / Time-Series Clustering] Multi-Density Adaptive Clustering in Python: Multi-DBSCAN

The code in this post is a heavy refactor of the implementation in the referenced article: it is more convenient to call, and in my tests it runs tens of times faster.
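
Most of the speed-up comes from replacing nested Python loops over the distance matrix with vectorized NumPy operations (see the "over 160x / 250x faster" comments in the class below). A minimal toy illustration of the idea, not taken from the original article:

import numpy as np

# Toy distance matrix with rows already sorted, so column k holds each point's
# k-th nearest-neighbour distance (column 0 would be the distance to itself).
rng = np.random.default_rng(0)
dist_sorted = np.sort(rng.random((500, 500)), axis=1)

k = 3
# loop-based average, in the style of the replaced returnDkAverage helper
loop_avg = sum(dist_sorted[i][k] for i in range(len(dist_sorted))) / len(dist_sorted)
# vectorized average, as used in returnEpsCandidate below
vec_avg = np.mean(dist_sorted[:, k])
assert abs(loop_avg - vec_avg) < 1e-12

The refactored class: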

import math
import copy
import numpy as np
from sklearn.cluster import DBSCAN
import sklearn.metrics.pairwise as pairwise


class Adapter_DBSCAN():

    # By default, keep clustering results with 2 to 25 clusters; the range follows
    # Python's half-open convention, i.e. [2, 26)
    def __init__(self,num_cluster_range=(2,26)):
        self.num_cluster_range = num_cluster_range
    

    def returnEpsCandidate(self,dataSet):
        """
        :param dataSet: the data set
        :return: list of eps candidates
        """
        #self.DistMatrix = self.CalculateDistMatrix(dataSet)
        self.DistMatrix = pairwise.euclidean_distances(dataSet)
        tmp_matrix = copy.deepcopy(self.DistMatrix)
        for i in range(len(tmp_matrix)):
            tmp_matrix[i].sort()
        EpsCandidate = []
        for k in range(1,len(dataSet)):
            #Dk = self.returnDk(tmp_matrix,k)
            Dk = tmp_matrix[:,k]
            # DkAverage = self.returnDkAverage(Dk)
            # np.mean is over 160x faster than the loop-based returnDkAverage
            DkAverage = np.mean(Dk)
            EpsCandidate.append(DkAverage)
        return EpsCandidate
    

    def returnMinptsCandidate(self,DistMatrix,EpsCandidate,X):
        """
        :param DistMatrix: distance matrix
        :param EpsCandidate: list of eps candidates
        :param X: the data set
        :return: list of minpts candidates
        """
        MinptsCandidate = []
        for k in range(len(EpsCandidate)):
            tmp_eps = EpsCandidate[k]
            tmp_count = 0
            # for i in range(len(DistMatrix)):
            #     for j in range(len(DistMatrix[i])):
            #         if DistMatrix[i][j] <= tmp_eps:
            #             tmp_count = tmp_count + 1
            # np.sum over the whole matrix is over 250x faster than the nested loops above
            tmp_count = np.sum(DistMatrix <= tmp_eps)
            MinptsCandidate.append(tmp_count/len(X))
        return MinptsCandidate
    

    def fit(self,X):
        self.EpsCandidate = self.returnEpsCandidate(X)
        self.MinptsCandidate = self.returnMinptsCandidate(self.DistMatrix,self.EpsCandidate,X)
        self.do_multi_dbscan(X)
        self.set_num_clusters_range(self.num_cluster_range)


    def do_multi_dbscan(self,X):
        self.all_predict_dict = {}
        self.all_param_dict = {}

        for i in range(len(self.EpsCandidate)):
            eps = self.EpsCandidate[i]
            minpts = self.MinptsCandidate[i]
            # note: some scikit-learn versions validate min_samples as an integer;
            # if DBSCAN rejects the float, pass min_samples=int(round(minpts)) instead
            db = DBSCAN(eps=eps,min_samples=minpts).fit(X)
            num_clusters = max(db.labels_) + 1
            # group each clustering result by its number of clusters

            if num_clusters not in self.all_predict_dict.keys():
                self.all_predict_dict[num_clusters] = []
                self.all_param_dict[num_clusters] = []

            self.all_predict_dict[num_clusters].append(db.labels_)
            self.all_param_dict[num_clusters].append({"eps":eps,"minpts":minpts})


    # Filter by number of clusters. For example, if Multi-DBSCAN produced 3-cluster,
    # 15-cluster and 136-cluster results and you only want to see those with 10~20
    # clusters, call set_num_clusters_range((10, 21)) and then get_predict_dict().
    def set_num_clusters_range(self,num_cluster_range:tuple):
        self.predict_dict = {}
        self.param_dict = {}
        # keep only the results whose cluster count falls within the requested range

        for num_cluster, labels in self.all_predict_dict.items():
            if num_cluster_range[0] <= num_cluster < num_cluster_range[1]:
                self.predict_dict[num_cluster] = labels
                self.param_dict[num_cluster] = self.all_param_dict[num_cluster]


    # Get the clustering prediction info of the current Multi-DBSCAN run,
    # formatted as {num_clusters: [[prediction 1], [prediction 2], ...]}.
    # A given cluster count (e.g. 3) can come from several parameter pairs,
    # hence prediction 1, prediction 2, ...
    def get_predict_dict(self):
        if getattr(self, "predict_dict", None) is None:
            raise RuntimeError("get_predict_dict called before fit")
        return self.predict_dict


    # Get the parameter info of the current Multi-DBSCAN run, formatted as
    # {num_clusters: [{"eps": x1, "minpts": y1}, {"eps": x2, "minpts": y2}, ...]}
    def get_info_dict(self):
        if getattr(self, "param_dict", None) is None:
            raise RuntimeError("get_info_dict called before fit")
        return self.param_dict
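
To recap how the class above builds its parameter candidates: for every k from 1 to n-1, the eps candidate is the mean distance from each point to its k-th nearest neighbour, and the matching minpts candidate is the average number of points (the point itself included) whose distance is within that eps. A minimal standalone sketch of the same computation for a single k:

import numpy as np
import sklearn.metrics.pairwise as pairwise

def candidate_params(X, k):
    """Return the (eps, minpts) candidate pair for a given k, mirroring Adapter_DBSCAN."""
    D = pairwise.euclidean_distances(X)
    D_sorted = np.sort(D, axis=1)
    eps = np.mean(D_sorted[:, k])        # mean k-th nearest-neighbour distance
    minpts = np.sum(D <= eps) / len(X)   # average neighbourhood size at this eps
    return eps, minpts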

The dataset used in the experiment has 788 rows; each line is one sample with comma-separated float features, which is the format loadDataSet below parses.
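
Purely for illustration (the values below are made up, not copied from the real 788points.txt, and the two-column layout is an assumption), a file in this format would look like:

15.55,28.65
14.90,27.55
16.45,28.35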

Test:

def loadDataSet(fileName):
    """
    Input: file name
    Output: data set
    Description: read the data set from a file
    """
    dataSet = []
    with open(fileName) as fr:
        for line in fr.readlines():
            curline = line.strip().split(",")
            fltline = list(map(float, curline))
            dataSet.append(fltline)
    return np.array(dataSet)


if __name__ == '__main__':

    X = loadDataSet('common/788points.txt')
    DB = Adapter_DBSCAN()
    DB.fit(X)
    
    # output only the 15-cluster results
    DB.set_num_clusters_range((15,16))
    # predicted label info
    predict_dict = DB.get_predict_dict()
    # parameter info
    info_dict = DB.get_info_dict()

    print(predict_dict)
    print("================================")
    print(info_dict)

# Note that nowhere in the Multi-DBSCAN workflow do you set eps or minpts by hand,
# and you can also restrict the range of cluster counts in the output, which is quite convenient.

Output:

{15: [array([-1, -1, -1,  0,  0,  0,  0,  0, -1, -1,  0,  0,  0, -1, -1, -1,  0,
        0,  0,  0,  0,  0, -1,  0,  0, -1, -1, -1, -1, -1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  3,  1, -1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1, -1,  1,  2, -1,  1, -1, -1, -1,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2, -1,  4,  4,  2,  2,  2,  2,  3,  3,  3,  4,  4,
        4,  4,  5,  5,  5,  5, -1,  6,  3,  6,  6,  6,  6,  6,  6, -1,  7,
        7,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3, -1, -1,  3,  3,  3,
        3,  3,  3,  3,  3,  3,  3,  3, -1, -1, -1,  3,  3,  3,  6,  7,  7,
        3, -1, -1, -1,  7,  7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        8,  8,  8, -1,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
        8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,
       -1, -1,  9,  9, -1,  9,  9,  9,  9,  9,  9,  9,  9, -1,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, -1,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, -1,
        9,  9,  9,  9,  9,  9, -1,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, -1, -1,  9,  9,
        9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9, -1,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9, 10, 10, 10, 10, 10, -1, 10, -1, -1, -1, 11, -1, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, -1, 10, -1, 10, 10, 11,
       11, 11, -1, -1, 11, 11, 11, 11, 11, -1, 11, 11, 11, 11, 11, 11, 11,
       11, 11, 11, 11, -1, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
       11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
       11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
       11, 11, -1, -1, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
       12, -1, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
       12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, -1,
       -1, -1, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
       12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
       12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
       12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
       12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, -1, 13, 13, 13, 13,
       13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
       -1, 13, 13, 13, 13, 13, 13, 13, 13, -1, 13, 13, 13, 13, 13, 13, -1,
       13, -1, -1, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
       14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
       14, 14, 14, 14, 14, 14]), array([-1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1,  0,  0,
        0,  0,  0,  0,  0, -1,  0,  0,  0,  0, -1, -1, -1, -1,  1,  1,  1,
        1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1, -1, -1, -1, -1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1, -1,  2,  2,  1,  1,  1,  1,  1,  1,
        1,  2,  2, -1, -1, -1, -1,  3,  2,  2,  3,  3,  3,  3,  3,  3,  3,
        3, -1, -1, -1, -1, -1, -1, -1,  3, -1, -1, -1, -1, -1, -1, -1, -1,
       -1,  3,  3,  3,  3,  3,  3,  3,  3,  3, -1, -1, -1, -1, -1, -1, -1,
       -1,  3,  3,  3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
        4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,
       -1, -1, -1, -1, -1, -1,  5, -1,  5,  5,  5,  5,  5,  5, -1, -1,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, -1,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, -1,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5, -1,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
       -1,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,
        5,  5,  5,  5, -1,  5, -1,  5,  5,  5, -1,  5,  5,  5,  5,  5,  5,
        5,  6,  6, -1,  6, -1, -1, -1, -1, -1,  7,  7, -1, -1,  6,  6,  6,
        6,  6,  6,  6,  6,  6,  6, -1, -1,  6,  6, -1, -1, -1, -1,  6,  8,
        8,  6, -1, -1, -1,  7,  7,  7,  7,  7,  7,  7,  9, -1,  8,  8,  8,
        8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  9,  9,
       -1,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,
        9,  9,  9,  9,  8,  8,  8,  8,  8,  8, -1, -1, -1, -1,  9,  9,  9,
        9, -1, -1, -1, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 11, 10, 10, 11, 11, 11, 11, 11, 11, -1,
       11, 12, 12, 12, 12, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, -1,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, -1, 10, 10, 10,
       10, 10, 10, 10, 10, 10, -1, 10, 10, 10, 10, 10, 10, 10, 12, 10, 12,
       12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, -1, -1, -1, -1, -1,
       12, 12, 10, 10, -1, -1, -1, -1, -1, -1, -1, 10, 13, 13, 13, 13, 13,
       13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, -1, 13, 13, -1,
       -1, 13, 13, 13, 13, 13, 13, 13, 13, -1, 13, 13, 13, 13, 13, 13, -1,
       13, 13, -1, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
       14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
       14, 14, 14, 14, 14, 14])]}
================================
{15: [{'eps': 1.0547224554276167, 'minpts': 5.916243654822335}, {'eps': 1.238569836383374, 'minpts': 8.055837563451776}]}

The output above shows the two candidate solutions that Multi-DBSCAN produced for the 15-cluster case.
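
The post itself does not choose between the two. One reasonable way to pick (not part of the original workflow) is to score each candidate labelling with the silhouette coefficient on the non-noise points and keep the better one; a hedged sketch reusing X, predict_dict and info_dict from the test above:

from sklearn.metrics import silhouette_score

# score each 15-cluster candidate on its non-noise points and keep the best
best = None
for labels, params in zip(predict_dict[15], info_dict[15]):
    mask = labels != -1                    # drop DBSCAN noise points
    score = silhouette_score(X[mask], labels[mask])
    if best is None or score > best[0]:
        best = (score, params)
print("best silhouette:", best[0], "with params:", best[1])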

Time-series clustering test

Next, a time-series clustering test. The data are sea-surface temperature (and similar) series from the East China Sea, Bohai Sea, Yellow Sea and nearby areas. The sample is small: a little over 60 samples, each a single variable with 2800+ time points already reduced to 100 dimensions, giving an array of shape (60+, 100).
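
The post does not show how the 2800+ time points were reduced to 100 dimensions. One simple possibility (purely an assumption on my part, not necessarily the method actually used) is to resample each series onto 100 evenly spaced points:

import numpy as np

def downsample_series(series, target_len=100):
    """Linearly interpolate one long series onto target_len evenly spaced points."""
    old_x = np.linspace(0.0, 1.0, num=len(series))
    new_x = np.linspace(0.0, 1.0, num=target_len)
    return np.interp(new_x, old_x, series)

# raw array of shape (n_samples, 2800+) -> reduced array of shape (n_samples, 100)
# X_reduced = np.vstack([downsample_series(s) for s in X_raw])

The test script itself: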

import math
import time
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics.pairwise as pairwise
from sklearn.cluster import DBSCAN
from common.my_clustering import Adapter_DBSCAN
from sklearn.metrics.cluster import \
    silhouette_score as sklearn_silhouette_score


# automatically lay out the subplot grid (rows, cols) for a given number of panels
def multip_decomposition(num_cluster):
    s = math.sqrt(num_cluster)
    col = math.ceil(s)
    row = int(s)
    flag = False
    # grow the grid, alternating between columns and rows, until it holds all panels
    while col * row < num_cluster:
        if flag:
            row += 1
        else:
            col += 1
        flag = not flag
    return row, col


def test_multi_dbscan(config):
    X = np.loadtxt(config["data_path"],dtype=np.float32)
    adb = Adapter_DBSCAN(num_cluster_range=(2,30))
 
    t0 = time.time()
    adb.fit(X)
    t1 = time.time()

    predict_dict = adb.get_predict_dict()
    info_dict = adb.get_info_dict()

    for num_cluster, labels, infos in zip(predict_dict.keys(),predict_dict.values(),info_dict.values()):
        for possible,y_pred in enumerate(labels):
            # time-series panels look better laid out horizontally, hence col,row instead of row,col
            col,row = multip_decomposition(num_cluster + 2)
            y_pred_valid = []
            X_valid = []
            for yi in range(num_cluster):
                plt.subplot(row, col, yi + 1)
                clusters_yi = X[y_pred == yi]
                for xs in clusters_yi:
                    X_valid.append(xs.tolist())
                    y_pred_valid.append(yi)
                for xx in clusters_yi:
                    plt.plot(xx.ravel(), "k-", alpha=.3)
                plt.plot(np.mean(clusters_yi,axis=0).ravel(), "r-")
            
            X_valid = np.array(X_valid)
            y_pred_valid = np.array(y_pred_valid)
            print(X_valid.shape)
            print(y_pred_valid.shape)

            # noise panel (DBSCAN label -1)
            plt.subplot(row, col, num_cluster + 1)
            clusters_noise = X[y_pred == -1]
            for xx in clusters_noise:
                plt.plot(xx.ravel(), "k-", alpha=.3)
            if len(clusters_noise) > 0:
                plt.plot(np.mean(clusters_noise, axis=0).ravel(), "r-")
            plt.text(0.55, 0.85, 'noise',
                     transform=plt.gca().transAxes)

            # "uncertain" panel (label -2); plain sklearn DBSCAN never produces -2,
            # so this panel stays empty unless the labels come from a variant that uses it
            plt.subplot(row, col, num_cluster + 2)
            clusters_uncertain = X[y_pred == -2]
            for xx in clusters_uncertain:
                plt.plot(xx.ravel(), "k-", alpha=.3)
            if len(clusters_uncertain) > 0:
                plt.plot(np.mean(clusters_uncertain, axis=0).ravel(), "r-")
            plt.text(0.55, 0.85, 'uncertain',
                     transform=plt.gca().transAxes)

            plt.tight_layout()

            # prefix, suffix = filepath.split(".")
            # prefix = data_process.exp_replace(prefix,"exp4")
            # img_save_path = "{}_{}_{}minpts_{}clusters_possible{}.png".format(\
            #   prefix,config["test_name"],"%.2f" % infos[possible]["minpts"],num_cluster,possible)
            # logger.log_img("clustering result figure: {}".format(img_save_path),img_save_path)
            # save the matplotlib figure wherever suits your setup

            # silhouette score
            t2 = time.time()
            dists = pairwise.pairwise_distances(X_valid,metric=config["cdists_method"],n_jobs=-1)
            score = sklearn_silhouette_score(dists,y_pred_valid,metric=config["method"])
            t3 = time.time()
            # logger.log("silhouette_score="+str(score)+", time cost: "+str(t3 - t2)+"s\n\n")
            # log or save the silhouette score wherever suits your setup

# run the test
filepath=...
config = {"method":"precomputed","cdists_method":"l2", "test_name":"multidbscan", "data_path":filepath,"seed":0}
test_multi_dbscan(config)

Because the sea areas in this part of the data are geographically close and the sample is small, the series are noisy and hard to separate.

Experimental results:

The experiment log shows that, excluding noise, the clusters still reach a fairly good silhouette score of 0.5+, so the clustering result can be considered decent (the KMeans family only reached about 0.2+ on the same data).
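
For reference, the KMeans comparison mentioned above can be reproduced along these lines; this is only a minimal sketch, the cluster count k is an illustrative choice, and the exact KMeans setup behind the 0.2+ figure is not reproduced here:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# hypothetical baseline on the same (60+, 100) array, scored with the same l2 metric
X = np.loadtxt(config["data_path"], dtype=np.float32)
k = 8  # illustrative value; set it to the cluster count you want to compare against
km = KMeans(n_clusters=k, n_init=10, random_state=config["seed"]).fit(X)
print("kmeans silhouette:", silhouette_score(X, km.labels_, metric="l2"))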

 
