基于密度聚类的探索~~~~
from sklearn.cluster import DBSCAN
from sklearn import preprocessing
import pandas as pd
import numpy as np
from sklearn.manifold import MDS
import matplotlib.pyplot as plt
from collections import Counter
# Load the spreadsheet. Every column except the first and the last is used as
# a feature; the last column is kept separately as `y`.
# NOTE(review): presumably the first column is an ID and the last a label --
# confirm against the actual workbook.
df = pd.read_excel('D:/Acompany/Abusiness/test.xlsx')
x = df.iloc[:, 1:-1].to_numpy()
y = df.iloc[:, -1:].to_numpy().reshape(-1)

# Standardise the features with StandardScaler before clustering.
scaler = preprocessing.StandardScaler()
x_norm = scaler.fit(x).transform(x)
# Search for good DBSCAN parameters: try every (eps, min_samples) pair on the
# grid and record how many noise points and how many clusters each produces.
#
# The result `p` has one row per combination and positional columns:
#   0 -> eps, 1 -> min_samples, 2 -> number of noise points, 3 -> number of clusters
rows = []
for j in np.linspace(0.1, 5, 30):
    for i in range(20, 80):
        clf = DBSCAN(eps=j, min_samples=i).fit(x_norm)
        labels = clf.labels_
        # DBSCAN marks noise samples with the label -1.
        a = int(np.count_nonzero(labels == -1))
        # The noise label is not a cluster, so leave it out of the cluster count.
        b = len(set(labels)) - (1 if a else 0)
        rows.append([j, i, a, b])

# Build the frame once from the collected rows: DataFrame.append was removed
# in pandas 2.0, and calling it per iteration re-copies the whole frame.
p = pd.DataFrame(rows)
# Refit with the parameter combination picked from the grid search: the one
# giving the fewest outliers and the most clusters.
clf = DBSCAN(eps=4.15517, min_samples=22)
clf.fit(x_norm)

# Reduce the standardised features to two dimensions with MDS so the
# clustering can be drawn; points are coloured by their DBSCAN label.
# No random_state is set here, so the layout may differ between runs.
emb = MDS(n_components=2)
x_transformed = emb.fit_transform(x_norm)
plt.scatter(*x_transformed.T, c=clf.labels_)
花絮:官方文档说DBSCAN算法最麻烦的就是找到eps和min_samples参数的取值。翻山越岭地找到如下:
**解释一:**
参考文献:https://www.datanovia.com/en/lessons/dbscan-density-based-clustering-essentials/
解释二:
参考文献 https://www.codenong.com/12893492/
尝试如下:
# Install the required packages only when they are missing, so re-running the
# script does not reinstall them every time.
for (pkg in c('dbscan', 'readxl')) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(dbscan)
library(readxl)

# Read the first sheet of the workbook and standardise every column.
# NOTE(review): the Python version drops the first and last columns before
# scaling -- confirm whether the same subset should be used here.
data <- read_excel('D:/Acompany/Abusiness/test.xlsx', sheet = 1)
data <- scale(data)

# Plot the sorted k-nearest-neighbour distances (k = 22); the "knee" of the
# curve suggests a value for eps. The scaled `data` is passed here -- the
# original referenced an undefined variable `a`.
dbscan::kNNdistplot(data, k = 22)
差不多看出来K=22 eps=1.22