"""聚类三件套:
- 聚类前用霍普金斯评价指标评估数据能否聚类
- 数据归一化后用 k-means 聚类
- 用轮廓系数评估聚类效果
"""
# @Time: 2022/1/26 14:27
# @Author:tang
# @File: 聚类3件套.py
# @Des: 1.评估有无必要聚类,用霍普金斯统计量;2.sklearn中k均值聚类;3.sklearn聚类效果评估:轮廓系数
#ref:https://blog.csdn.net/qq_16633405/article/details/119995976
#https://blog.csdn.net/weixin_39671140/article/details/114690991
#https://blog.csdn.net/wei18791957243/article/details/91360356
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score,davies_bouldin_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.cluster import KMeans
from random import sample
from numpy.random import uniform
import numpy as np
from math import isnan
import pandas as pd
from numpy.random import uniform, normal
from scipy.spatial.distance import cdist
# n-dimensional Hopkins statistic.  Input: 2-D numeric DataFrame (one row per
# observation); output: float in [0, 1].  H close to 1 suggests the data has a
# clustering tendency; H near 0.5 suggests uniformly random data.
def hopkins_statistic(data: pd.DataFrame, sampling_ratio: float = 0.4) -> float:
    """Compute the Hopkins statistic of ``data``.

    Args:
        data: 2-D numeric DataFrame, one observation per row.
        sampling_ratio: fraction of rows to sample; values outside the
            interval [0.1, 0.5] are clipped to the nearest endpoint.
            (The original comment claimed a 0.3 default; the actual
            default is 0.4.)

    Returns:
        H in [0, 1]; values near 1 indicate high clusterability.
    """
    # Clip the sampling ratio into [0.1, 0.5].
    sampling_ratio = min(max(sampling_ratio, 0.1), 0.5)
    # Number of rows to sample — at least 1 so the statistic is defined
    # even for very small inputs (0 samples would yield 0/0 = NaN).
    n_samples = max(1, int(data.shape[0] * sampling_ratio))
    # Rows sampled from the original data.
    sample_data = data.sample(n_samples)
    # Remaining rows after removing the sample.
    data = data.drop(index=sample_data.index)
    # Sum of nearest-neighbor distances from each sampled row to the
    # remaining data (cdist -> (n_remaining, n_samples); min over rows).
    data_dist = cdist(data, sample_data).min(axis=0).sum()
    # Artificial points drawn uniformly within each column's min/max range.
    ags_data = pd.DataFrame({col: uniform(data[col].min(), data[col].max(), n_samples)
                             for col in data})
    # Sum of nearest-neighbor distances from each artificial point to the data.
    ags_dist = cdist(data, ags_data).min(axis=0).sum()
    # Hopkins statistic H.
    H_value = ags_dist / (data_dist + ags_dist)
    return H_value
# Second implementation of the Hopkins statistic, using sklearn NearestNeighbors.
def hopkins(X):
    """Compute the Hopkins statistic of DataFrame ``X`` (rows = observations).

    Returns:
        H in [0, 1]; values near 1 indicate a clustering tendency,
        ~0.5 indicates uniformly random data.  Returns 0 if the
        computation degenerates to NaN (e.g. all distances are zero).
    """
    d = X.shape[1]      # number of features
    n = len(X)          # number of rows
    m = int(0.3 * n)    # sample-size heuristic from the referenced article
    nbrs = NearestNeighbors(n_neighbors=1).fit(X.values)
    # Indices of m rows sampled without replacement from X.
    rand_X = sample(range(0, n, 1), m)
    ujd = []  # nearest-neighbor distances of artificial uniform points to X
    wjd = []  # nearest-neighbor distances of sampled real points to the rest of X
    for j in range(0, m):
        # An artificial point drawn uniformly inside X's bounding box is NOT a
        # member of X, so its true nearest neighbor is at index 0.  The original
        # code queried 2 neighbors and took index 1, systematically
        # overestimating these distances and biasing H toward 1.
        u_dist, _ = nbrs.kneighbors(
            uniform(np.amin(X, axis=0), np.amax(X, axis=0), d).reshape(1, -1),
            1, return_distance=True)
        ujd.append(u_dist[0][0])
        # A sampled real point IS in X, so index 0 is the point itself at
        # distance 0; index 1 is the nearest distinct neighbor.
        w_dist, _ = nbrs.kneighbors(
            X.iloc[rand_X[j]].values.reshape(1, -1), 2, return_distance=True)
        wjd.append(w_dist[0][1])
    H = sum(ujd) / (sum(ujd) + sum(wjd))
    # 0.0/0.0 on float sums yields NaN rather than raising; fall back to 0.
    if isnan(H):
        print(ujd, wjd)
        H = 0
    return H
# Column-wise rescaling of the feature matrix.
def standard_scala(data):
    """Min-max normalize ``data`` to the [0, 1] range per column.

    Returns the transformed numpy array.  (A StandardScaler z-score
    transform could be substituted here if standardization is preferred
    over min-max scaling.)
    """
    return MinMaxScaler().fit_transform(data)
# K-means clustering followed by internal cluster-quality metrics.
def my_kmeans(data, n_clusters=2):
    """Cluster ``data`` with k-means and score the resulting partition.

    Args:
        data: 2-D array-like of (normalized) feature rows.
        n_clusters: number of clusters to fit (default 2).

    Returns:
        (s_score, dbi): silhouette score (higher is better) and
        Davies-Bouldin index (lower is better).
    """
    estimator = KMeans(n_clusters)
    # fit_predict labels the training data in one pass, avoiding the
    # redundant fit() + predict() double computation of the original.
    label_pred = estimator.fit_predict(data)
    s_score = silhouette_score(data, label_pred)    # higher is better
    dbi = davies_bouldin_score(data, label_pred)    # lower is better
    return s_score, dbi
if __name__ == "__main__":
    data = pd.read_csv("infos.csv")
    # Drop the "vin" column: it is a vehicle identifier, not a feature.
    data.drop(["vin"], axis=1, inplace=True)
    # Per column: fill missing values with the mode, map the "\N" placeholder
    # to 0, then coerce to float.  Plain column assignment replaces the
    # original inplace=True calls on `data[col]`, which hit pandas'
    # chained-assignment pitfall and may silently fail to modify the frame.
    for col in data.columns:
        data[col] = data[col].fillna(data[col].mode()[0])
        data[col] = data[col].replace("\\N", 0)
        data[col] = data[col].astype(float)
    # Min-max normalize all features to [0, 1].
    data = standard_scala(data)
    # Hopkins statistic: assess whether the normalized data is clusterable.
    res = hopkins(pd.DataFrame(data))  # alternative: hopkins_statistic(...)
    print("霍普金斯统计量为:" + str(res))
    print()
    # Sweep the cluster count and report silhouette / DBI for each k.
    for n_cluster in range(2, 10):
        s_score, dbi = my_kmeans(data, n_cluster)
        print("当聚为{}个类时,轮廓系数为 {},dbi为 {}".format(n_cluster, s_score, dbi))