本文代码基于该篇进行魔改,功能调用更加方便,速度经测试快了几十倍
import math
import copy
import numpy as np
from sklearn.cluster import DBSCAN
import sklearn.metrics.pairwise as pairwise
class Adapter_DBSCAN():
# 默认统计聚类个数在2-25之间的聚类情况, 参数符合python左闭右开
def __init__(self,num_cluster_range=(2,26)):
self.num_cluster_range = num_cluster_range
def returnEpsCandidate(self,dataSet):
"""
:param dataSet: 数据集
:return: eps候选集合
"""
#self.DistMatrix = self.CalculateDistMatrix(dataSet)
self.DistMatrix = pairwise.euclidean_distances(dataSet)
tmp_matrix = copy.deepcopy(self.DistMatrix)
for i in range(len(tmp_matrix)):
tmp_matrix[i].sort()
EpsCandidate = []
for k in range(1,len(dataSet)):
#Dk = self.returnDk(tmp_matrix,k)
Dk = tmp_matrix[:,k]
# DkAverage = self.returnDkAverage(Dk)
# 快160+倍
DkAverage = np.mean(Dk)
EpsCandidate.append(DkAverage)
return EpsCandidate
def returnMinptsCandidate(self,DistMatrix,EpsCandidate,X):
"""
:param DistMatrix: 距离矩阵
:param EpsCandidate: Eps候选列表
:return: Minpts候选列表
"""
MinptsCandidate = []
for k in range(len(EpsCandidate)):
tmp_eps = EpsCandidate[k]
tmp_count = 0
# for i in range(len(DistMatrix)):
# for j in range(len(DistMatrix[i])):
# if DistMatrix[i][j] <= tmp_eps:
# tmp_count = tmp_count + 1
# 快250+倍
tmp_count = np.sum(DistMatrix <= tmp_eps)
MinptsCandidate.append(tmp_count/len(X))
return MinptsCandidate
def fit(self,X):
self.EpsCandidate = self.returnEpsCandidate(X)
self.MinptsCandidate = self.returnMinptsCandidate(self.DistMatrix,self.EpsCandidate,X)
self.do_multi_dbscan(X)
self.set_num_clusters_range(self.num_cluster_range)
def do_multi_dbscan(self,X):
self.all_predict_dict = {}
self.all_param_dict = {}
for i in range(len(self.EpsCandidate)):
eps = self.EpsCandidate[i]
minpts = self.MinptsCandidate[i]
db = DBSCAN(eps=eps,min_samples=minpts).fit(X)
num_clusters = max(db.labels_) + 1
# 统计符合范围的聚类情况
if num_clusters not in self.all_predict_dict.keys():
self.all_predict_dict[num_clusters] = []
self.all_param_dict[num_clusters] = []
self.all_predict_dict[num_clusters].append(db.labels_)
self.all_param_dict[num_clusters].append({"eps":eps,"minpts":minpts})
# 筛选聚类个数,比如Multi-DBSCAN共产生了3聚类、15聚类、136聚类三种情况
# 我想只看10~20的聚类情况,就可以设置set_num_clusters_range(10~21)后调用get_predict_dict()
def set_num_clusters_range(self,num_cluster_range:tuple):
self.predict_dict = {}
self.param_dict = {}
# 统计符合范围的聚类情况
for num_cluster, labels, params in zip(self.all_predict_dict.keys(),\
self.all_predict_dict.values(), self.all_param_dict.values()):
if num_cluster >= num_cluster_range[0] and \
num_cluster < num_cluster_range[1]:
self.predict_dict[num_cluster] = labels
self.param_dict[num_cluster] = params
# 获取当前Multi-DBSCAN的聚类预测信息,格式为{聚类个数:[[预测可能1],[预测可能2],...]}
# 比如聚类个数为3的情况可能有多种,所以有预测可能1、预测可能2...
def get_predict_dict(self):
if self.predict_dict is None:
raise RuntimeError("get_predict_dict before fit")
return self.predict_dict
# 获取当前Multi-DBSCAN的聚类参数信息,格式为{聚类个数:[{"eps":x1,"minpts":y1},{"eps":x2,"minpts":y2},...]}
def get_info_dict(self):
if self.param_dict is None:
raise RuntimeError("get_info_dict before fit")
return self.param_dict
实验用到的数据集格式,共788行:
测试:
def loadDataSet(fileName):
"""
输入:文件名
输出:数据集
描述:从文件读入数据集
"""
dataSet = []
with open(fileName) as fr:
for line in fr.readlines():
curline = line.strip().split(",")
fltline = list(map(float, curline))
dataSet.append(fltline)
return np.array(dataSet)
if __name__ == '__main__':
X = loadDataSet('common/788points.txt')
DB = Adapter_DBSCAN()
DB.fit(X)
# 输出15聚类的情况
DB.set_num_clusters_range((15,16))
# label预测信息
predict_dict = DB.get_predict_dict()
# 参数信息
info_dict = DB.get_info_dict()
print(predict_dict)
print("================================")
print(info_dict)
# 注意到整个Multi-DBSCAN过程中eps、minpts参数都不需要你手工设置
# 还可以控制输出聚类个数范围,是不是很方便?
打印:
{15: [array([-1, -1, -1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, -1, -1, -1, 0,
0, 0, 0, 0, 0, -1, 0, 0, -1, -1, -1, -1, -1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 3, 1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, -1, 1, 2, -1, 1, -1, -1, -1, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, -1, 4, 4, 2, 2, 2, 2, 3, 3, 3, 4, 4,
4, 4, 5, 5, 5, 5, -1, 6, 3, 6, 6, 6, 6, 6, 6, -1, 7,
7, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, -1, -1, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, -1, -1, -1, 3, 3, 3, 6, 7, 7,
3, -1, -1, -1, 7, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
8, 8, 8, -1, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-1, -1, 9, 9, -1, 9, 9, 9, 9, 9, 9, 9, 9, -1, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, -1, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, -1,
9, 9, 9, 9, 9, 9, -1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, -1, -1, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, -1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 10, 10, 10, 10, 10, -1, 10, -1, -1, -1, 11, -1, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, -1, 10, -1, 10, 10, 11,
11, 11, -1, -1, 11, 11, 11, 11, 11, -1, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11, -1, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
11, 11, -1, -1, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
12, -1, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, -1,
-1, -1, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, -1, 13, 13, 13, 13,
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
-1, 13, 13, 13, 13, 13, 13, 13, 13, -1, 13, 13, 13, 13, 13, 13, -1,
13, -1, -1, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
14, 14, 14, 14, 14, 14]), array([-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0,
0, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, -1, -1, -1, 1, 1, 1,
1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, -1, 2, 2, 1, 1, 1, 1, 1, 1,
1, 2, 2, -1, -1, -1, -1, 3, 2, 2, 3, 3, 3, 3, 3, 3, 3,
3, -1, -1, -1, -1, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, -1,
-1, 3, 3, 3, 3, 3, 3, 3, 3, 3, -1, -1, -1, -1, -1, -1, -1,
-1, 3, 3, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
-1, -1, -1, -1, -1, -1, 5, -1, 5, 5, 5, 5, 5, 5, -1, -1, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, -1, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, -1,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, -1, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
-1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, -1, 5, -1, 5, 5, 5, -1, 5, 5, 5, 5, 5, 5,
5, 6, 6, -1, 6, -1, -1, -1, -1, -1, 7, 7, -1, -1, 6, 6, 6,
6, 6, 6, 6, 6, 6, 6, -1, -1, 6, 6, -1, -1, -1, -1, 6, 8,
8, 6, -1, -1, -1, 7, 7, 7, 7, 7, 7, 7, 9, -1, 8, 8, 8,
8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9,
-1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 8, 8, 8, 8, 8, 8, -1, -1, -1, -1, 9, 9, 9,
9, -1, -1, -1, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 11, 10, 10, 11, 11, 11, 11, 11, 11, -1,
11, 12, 12, 12, 12, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, -1,
10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, -1, 10, 10, 10,
10, 10, 10, 10, 10, 10, -1, 10, 10, 10, 10, 10, 10, 10, 12, 10, 12,
12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, -1, -1, -1, -1, -1,
12, 12, 10, 10, -1, -1, -1, -1, -1, -1, -1, 10, 13, 13, 13, 13, 13,
13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, -1, 13, 13, -1,
-1, 13, 13, 13, 13, 13, 13, 13, 13, -1, 13, 13, 13, 13, 13, 13, -1,
13, 13, -1, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
14, 14, 14, 14, 14, 14])]}
================================
{15: [{'eps': 1.0547224554276167, 'minpts': 5.916243654822335}, {'eps': 1.238569836383374, 'minpts': 8.055837563451776}]}
上述结果显示了Multi-DBSCAN在15聚类情况下的2种可能
时序聚类测试
再测试时序聚类,选取东海、渤海、黄海等海面温等数据,样本比较少,60多个,每个样本单要素,2800+时间点(已降维成100),形成shape=(60+,100)的数据
import time
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics.pairwise as pairwise
from sklearn.cluster import DBSCAN
from common.my_clustering import Adapter_DBSCAN
from sklearn.metrics.cluster import \
silhouette_score as sklearn_silhouette_score
# 自动根据聚类个数布局行列
def multip_decomposition(num_cluster):
s = math.sqrt(num_cluster)
col = math.ceil(s)
row = int(s)
flag = False
while(col * row < num_cluster):
if flag:
row += 1
flag = True
else:
col += 1
flag = False
return row, col
def test_multi_dbscan(config):
X = np.loadtxt(config["data_path"],dtype=np.float32)
adb = Adapter_DBSCAN(num_cluster_range=(2,30))
t0 = time.time()
adb.fit(X)
t1 = time.time()
predict_dict = adb.get_predict_dict()
info_dict = adb.get_info_dict()
for num_cluster, labels, infos in zip(predict_dict.keys(),predict_dict.values(),info_dict.values()):
for possible,y_pred in enumerate(labels):
# 时序图横着放更好看,所以是col,row而不是row,col
col,row = multip_decomposition(num_cluster + 2)
y_pred_valid = []
X_valid = []
for yi in range(num_cluster):
plt.subplot(row, col, yi + 1)
clusters_yi = X[y_pred == yi]
for xs in clusters_yi:
X_valid.append(xs.tolist())
y_pred_valid.append(yi)
for xx in clusters_yi:
plt.plot(xx.ravel(), "k-", alpha=.3)
plt.plot(np.mean(clusters_yi,axis=0).ravel(), "r-")
X_valid = np.array(X_valid)
y_pred_valid = np.array(y_pred_valid)
print(X_valid.shape)
print(y_pred_valid.shape)
plt.subplot(row, col, num_cluster + 1)
clusters_noise = X[y_pred == -1]
for xx in clusters_noise:
plt.plot(xx.ravel(), "k-", alpha=.3)
plt.plot(np.mean(clusters_noise,axis=0).ravel(), "r-")
plt.text(0.55, 0.85,'noise',
transform=plt.gca().transAxes)
plt.subplot(row, col, num_cluster + 2)
clusters_uncertain = X[y_pred == -2]
for xx in clusters_uncertain:
plt.plot(xx.ravel(), "k-", alpha=.3)
plt.plot(np.mean(clusters_uncertain,axis=0).ravel(), "r-")
plt.text(0.55, 0.85,'uncertain',
transform=plt.gca().transAxes)
plt.tight_layout()
# prefix, suffix = filepath.split(".")
# prefix = data_process.exp_replace(prefix,"exp4")
# img_save_path = "{}_{}_{}minpts_{}clusters_possible{}.png".format(\
# prefix,config["test_name"],"%.2f" % infos[possible]["minpts"],num_cluster,possible)
# logger.log_img("聚类结果图: {}".format(img_save_path),img_save_path)
# 自己找地方保存plt的图像
# 轮廓系数
t2 = time.time()
dists = pairwise.pairwise_distances(X_valid,metric=config["cdists_method"],n_jobs=-1)
score = sklearn_silhouette_score(dists,y_pred_valid,metric=config["method"])
t3 = time.time()
# logger.log("silhouette_score="+str(score)+", time cost: "+str(t3 - t2)+"s\n\n")
# 自己找地方保存轮廓系数
# 测试
filepath=...
config = {"method":"precomputed","cdists_method":"l2", "test_name":"multidbscan", "data_path":filepath,"seed":0}
test_multi_dbscan(config)
由于该部分数据海域选取的地理点较近,且样本较少,较杂且较难区分
实验结果:
实验日志显示除noise外的数据仍有较好的轮廓系数0.5+,说明聚类结果算理想(相比于KMeans系列轮廓系数只有0.2+):