对连续变量和分类变量混合的数据进行聚类,用了两种方法,k-prototypes和gower距离+kmeans,两种方法都是python直接编写没调包。
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import metrics
import seaborn as sns
plt.rcParams['font.sans-serif'] = ['SimHei'] # use the SimHei font so CJK text renders in plots
plt.rcParams['axes.unicode_minus']=False # keep the minus sign rendering correctly with a CJK font
warnings.filterwarnings("ignore") # suppress warning output
# Load the heart data: space-separated, no header row, so columns are 0..13.
# (Presumably the Statlog heart-disease dataset; column 13 is used as the
# class label below -- confirm against the data source.)
heart = pd.read_csv('C:\\Users\\91333\\Documents\\semester6\\data science\\6.聚类分析\\heart.dat', header = None, sep = ' ')
可视化
# Exploratory plots of a few feature pairs, grouped/colored by other columns
# (column 13 is used as the class label elsewhere in this script).
sns.stripplot(x=heart.iloc[:,13],y=heart.iloc[:,3],hue=heart.iloc[:,1])
sns.boxplot(x=heart.iloc[:,2],y=heart.iloc[:,4],hue=heart.iloc[:,13])
sns.violinplot(x=heart.iloc[:,5],y=heart.iloc[:,7],hue=heart.iloc[:,13])
# Standardize the continuous columns so scale differences do not dominate the
# distance computations. Column 10 is ordinal (has an order), but it is
# treated as continuous here.
Numerical = [0, 3, 4, 7, 9, 10, 11]
Type = [1, 2, 5, 6, 8, 12, 13]
heart_norm0 = heart.iloc[:, Numerical].apply(lambda x: (x - np.mean(x)) / (np.std(x)))
# BUGFIX: copy() -- the original `heart_norm = heart` aliased the DataFrame,
# so writing the standardized columns below silently mutated `heart` itself.
heart_norm = heart.copy()
heart_norm.iloc[:, Numerical] = heart_norm0
方法一:使用k-means的变体k-prototypes聚类
k-prototypes代码,来源网络他人智慧
原链接:https://blog.csdn.net/littlely_ll/article/details/80042928?utm_medium=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-1.nonecase&depth_1-utm_source=distribute.pc_relevant.none-task-blog-BlogCommendFromMachineLearnPai2-1.nonecase
我似乎稍微改了一下下
prototype是考研词汇哦,意思是 原型
import numpy as np
import random
from collections import Counter
def dist(x, y):
    """Euclidean distance between two numeric vectors."""
    diff = x - y
    return np.sqrt((diff ** 2).sum())
def sigma(x, y):
    """Number of positions where x and y disagree (categorical mismatch count)."""
    return len(x) - np.count_nonzero(x == y)
def KPrototypes(data, O, C, k, max_iters=10, gamma=0):
    """k-prototypes clustering for mixed numeric/categorical data.

    Each sample is assigned to the prototype minimising
    ``euclidean(numeric part) + gamma * (#mismatched categorical values)``;
    prototypes are updated incrementally (numeric mean / categorical mode).

    Parameters
    ----------
    data : array-like of shape (m, n)
        The data set; categorical columns are compared as strings.
    O : list of int
        Indices of the numeric (continuous/ordinal) columns.
    C : list of int
        Indices of the categorical columns.
    k : int
        Number of clusters.
    max_iters : int, optional
        Maximum number of reassignment passes after the initial pass.
    gamma : float, optional
        Weight of the categorical mismatch term relative to the Euclidean term.

    Returns
    -------
    list of int
        Cluster label (0..k-1) for each of the m samples.
    """

    def _euclid(a, b):
        # Euclidean distance over the numeric columns.
        return np.sqrt(np.sum((a - b) ** 2))

    def _mismatch(a, b):
        # Number of categorical positions where a and b differ.
        return len(a) - np.sum(a == b)

    data = np.array(data)
    m, n = data.shape
    # k distinct random samples serve as the initial prototypes.
    num = random.sample(range(m), k)
    O_data = data[:, O]
    C_data = data[:, C].astype("str")
    O_protos = O_data[num, :]
    C_protos = C_data[num, :]

    clusterShip = []     # cluster label per sample
    clusterCount = {}    # cluster -> member count
    sumInCluster = {}    # cluster -> running sums of the numeric columns
    freqInCluster = {}   # cluster -> one Counter of values per categorical column

    def _nearest(i):
        # Index of the prototype closest to sample i (first one wins ties).
        best, best_d = 0, float('inf')
        for j in range(k):
            d = _euclid(O_data[i, :], O_protos[j, :]) + \
                gamma * _mismatch(C_data[i, :], C_protos[j, :])
            if d < best_d:
                best_d, best = d, j
        return best

    def _ensure(c):
        # Lazily create the bookkeeping for cluster c.  BUGFIX: also fixes a
        # KeyError in the original when a sample moved into a cluster that
        # received no members during the initial pass.
        if c not in sumInCluster:
            sumInCluster[c] = [0] * len(O)
            # BUGFIX: the original built this list as
            # [Counter(...)] + [Counter()] * (len(C) - 1), which aliases ONE
            # shared Counter across every column after the first, so in-place
            # additions corrupted all categorical frequency tables at once.
            freqInCluster[c] = [Counter() for _ in range(len(C))]
            clusterCount[c] = 0

    def _add(c, i):
        # Add sample i to cluster c and refresh c's prototype.
        _ensure(c)
        clusterCount[c] += 1
        for j in range(len(O)):
            sumInCluster[c][j] += O_data[i, j]
            O_protos[c, j] = sumInCluster[c][j] / clusterCount[c]
        for j in range(len(C)):
            # BUGFIX: Counter([value]) counts the value itself; the original
            # Counter(value) counted the CHARACTERS of the string (e.g.
            # Counter("1.0") -> {'1': 1, '.': 1, '0': 1}), making the
            # categorical modes meaningless.
            freqInCluster[c][j] += Counter([C_data[i, j]])
            C_protos[c, j] = freqInCluster[c][j].most_common()[0][0]

    def _remove(c, i):
        # Remove sample i from cluster c and refresh c's prototype; the
        # prototype is left untouched when the cluster became empty.
        clusterCount[c] -= 1
        for j in range(len(O)):
            sumInCluster[c][j] -= O_data[i, j]
            if clusterCount[c] > 0:  # guard against division by zero
                O_protos[c, j] = sumInCluster[c][j] / clusterCount[c]
        for j in range(len(C)):
            freqInCluster[c][j] -= Counter([C_data[i, j]])
            if freqInCluster[c][j]:  # guard: mode undefined for an empty cluster
                C_protos[c, j] = freqInCluster[c][j].most_common()[0][0]

    # Initial pass: greedy assignment with incremental prototype updates.
    for i in range(m):
        cluster = _nearest(i)
        clusterShip.append(cluster)
        _add(cluster, i)

    # Reassignment passes: move samples whose nearest prototype changed.
    for _ in range(max_iters):
        for i in range(m):
            cluster = _nearest(i)
            if clusterShip[i] != cluster:
                oldCluster = clusterShip[i]
                clusterShip[i] = cluster
                _add(cluster, i)
                _remove(oldCluster, i)
    return clusterShip
调用k-prototypes函数,调节簇个数和分类变量权重两个参数,记录评价指标得分
# Grid search over the number of clusters k (2..4) and the categorical weight
# gamma, recording the adjusted Rand index (against the label in column 13)
# and the silhouette score for every combination.
ars = []
sc = []
for n_clusters in range(2, 5):
    ars_row, sc_row = [], []
    for gamma in [0, 0.5, 1, 1.5, 2]:
        labels = KPrototypes(data=heart_norm, O=Numerical, C=Type,
                             k=n_clusters, gamma=gamma, max_iters=100)
        ars_row.append(metrics.adjusted_rand_score(heart.iloc[:, 13], labels))
        sc_row.append(metrics.silhouette_score(heart_norm, labels, metric='euclidean'))
    ars.append(ars_row)
    sc.append(sc_row)
ars  # the low values show the clustering quality is poor
sc
方法二:kmeans + gower距离
gower距离代码
from scipy.sparse import issparse
import numpy as np
import pandas as pd
def gower_matrix(data_x, data_y=None, weight=None, cat_features=None):
    """Pairwise Gower distance matrix between the rows of data_x and data_y.

    Parameters
    ----------
    data_x : DataFrame or ndarray of shape (x_n_rows, n_cols)
    data_y : DataFrame or ndarray of shape (y_n_rows, n_cols), optional
        Defaults to ``data_x`` (self-distance matrix).
    weight : ndarray of shape (n_cols,), optional
        Per-feature weights; defaults to all ones.
    cat_features : array-like of bool/0-1 of shape (n_cols,), optional
        Truthy entries mark categorical columns.  If omitted, non-numeric
        columns are detected automatically.

    Returns
    -------
    ndarray of float32, shape (x_n_rows, y_n_rows)

    Raises
    ------
    TypeError
        If X and Y have mismatched columns or a sparse matrix is passed.
    """
    X = data_x
    Y = data_x if data_y is None else data_y

    # --- input validation --------------------------------------------------
    if not isinstance(X, np.ndarray):
        if not np.array_equal(X.columns, Y.columns):
            raise TypeError("X and Y must have same columns!")
    else:
        if not X.shape[1] == Y.shape[1]:
            raise TypeError("X and Y must have same y-dim!")
    if issparse(X) or issparse(Y):
        raise TypeError("Sparse matrices are not supported!")

    x_n_rows, x_n_cols = X.shape
    y_n_rows, y_n_cols = Y.shape

    # --- detect categorical columns when not given -------------------------
    if cat_features is None:
        if not isinstance(X, np.ndarray):
            # DataFrame: a column is categorical when its dtype is not numeric.
            # (Renamed from the original's misleading `is_number`.)
            is_not_number = np.vectorize(lambda dt: not np.issubdtype(dt, np.number))
            cat_features = is_not_number(X.dtypes)
        else:
            cat_features = np.zeros(x_n_cols, dtype=bool)
            for col in range(x_n_cols):
                if not np.issubdtype(type(X[0, col]), np.number):
                    cat_features[col] = True
    else:
        # BUGFIX: coerce to a BOOLEAN mask.  Callers in this script pass a
        # 0/1 int list; `np.array(...)` kept it as ints, so `Z[:, cat_features]`
        # and `weight[cat_features]` performed fancy indexing (repeatedly
        # selecting columns 0 and 1) instead of boolean masking.
        cat_features = np.asarray(cat_features, dtype=bool)

    if not isinstance(X, np.ndarray):
        X = np.asarray(X)
    if not isinstance(Y, np.ndarray):
        Y = np.asarray(Y)

    # Stack X on top of Y so column ranges/maxima are computed over both.
    Z = np.concatenate((X, Y))
    x_index = range(0, x_n_rows)
    y_index = range(x_n_rows, x_n_rows + y_n_rows)

    Z_num = Z[:, np.logical_not(cat_features)]
    num_cols = Z_num.shape[1]
    num_ranges = np.zeros(num_cols)
    num_max = np.zeros(num_cols)
    for col in range(num_cols):
        col_array = Z_num[:, col].astype(np.float32)
        # Renamed from `max`/`min` -- the original shadowed the builtins.
        col_max = np.nanmax(col_array)
        col_min = np.nanmin(col_array)
        if np.isnan(col_max):
            col_max = 0.0
        if np.isnan(col_min):
            col_min = 0.0
        num_max[col] = col_max
        # Range of the column AFTER the division by its max below.
        num_ranges[col] = (1 - col_min / col_max) if (col_max != 0) else 0.0

    # Scale numeric values into [0, 1] (columns whose max is 0 stay 0).
    Z_num = np.divide(Z_num, num_max, out=np.zeros_like(Z_num), where=num_max != 0)
    Z_cat = Z[:, cat_features]

    if weight is None:
        weight = np.ones(Z.shape[1])
    weight_cat = weight[cat_features]
    weight_num = weight[np.logical_not(cat_features)]
    weight_sum = weight.sum()

    X_cat = Z_cat[x_index, ]
    X_num = Z_num[x_index, ]
    Y_cat = Z_cat[y_index, ]
    Y_num = Z_num[y_index, ]

    def _gower_row(xi_cat, xi_num, yj_cat, yj_num):
        # Weighted Gower dissimilarity between one row and a batch of rows.
        sij_cat = np.where(xi_cat == yj_cat, np.zeros_like(xi_cat), np.ones_like(xi_cat))
        sum_cat = np.multiply(weight_cat, sij_cat).sum(axis=1)
        abs_delta = np.absolute(xi_num - yj_num)
        sij_num = np.divide(abs_delta, num_ranges, out=np.zeros_like(abs_delta),
                            where=num_ranges != 0)
        sum_num = np.multiply(weight_num, sij_num).sum(axis=1)
        return (sum_cat + sum_num) / weight_sum

    out = np.zeros((x_n_rows, y_n_rows), dtype=np.float32)
    for i in range(x_n_rows):
        # For a square (X vs X) matrix only the upper triangle is computed
        # and mirrored into the lower triangle; otherwise every column is
        # computed explicitly.
        j_start = i if x_n_rows == y_n_rows else 0
        res = _gower_row(X_cat[i, :], X_num[i, :],
                         Y_cat[j_start:y_n_rows, :], Y_num[j_start:y_n_rows, :])
        out[i, j_start:] = res
        if x_n_rows == y_n_rows:
            out[i:, j_start] = res  # mirror
    return out
def gower_get(xi_cat,xi_num,xj_cat,xj_num,feature_weight_cat,
              feature_weight_num,feature_weight_sum,categorical_features,
              ranges_of_numeric,max_of_numeric ):
    """Weighted Gower dissimilarity between one sample (xi_cat, xi_num) and
    every row of the batch (xj_cat, xj_num).

    Note: ``categorical_features`` and ``max_of_numeric`` are accepted for
    interface compatibility but are not used in the computation.
    """
    # Categorical contribution: 0 where the values match, 1 where they differ.
    mismatch = np.where(xi_cat == xj_cat, np.zeros_like(xi_cat), np.ones_like(xi_cat))
    cat_total = np.multiply(feature_weight_cat, mismatch).sum(axis=1)
    # Numeric contribution: range-scaled absolute difference
    # (columns with range 0 contribute 0).
    delta = np.absolute(xi_num - xj_num)
    scaled = np.divide(delta, ranges_of_numeric, out=np.zeros_like(delta),
                       where=ranges_of_numeric != 0)
    num_total = np.multiply(feature_weight_num, scaled).sum(axis=1)
    # Weighted average over all features.
    return (cat_total + num_total) / feature_weight_sum
改写Kmeans
初始化聚类中心
import copy
k=2 # number of clusters
cat = [0,1,1,0,0,1,1,0,1,0,0,0,1,1] # 0/1 mask of categorical columns, passed to gower_matrix as cat_features
np.random.seed(126)
x = np.array(heart_norm)
SampleIndex=np.random.randint(0,x.shape[0],size=k) # draw k random row indices as initial centers (NOTE(review): randint samples WITH replacement, so two centers could coincide -- confirm this is acceptable)
Center=x[SampleIndex,] # initial cluster centers
更新聚类中心
COld=np.zeros(Center.shape) # previous cluster centers (starts at all zeros)
CLabel=np.zeros(x.shape[0],dtype=np.int32) # cluster label per sample (integers 0..k-1)
# How far each center moved: stacking Center on top of COld gives a (2k, n)
# matrix whose Gower distance matrix has d(Center[i], COld[i]) on the
# diagonal at offset k.
Error=gower_matrix(np.append(Center, COld, axis=0),cat_features=cat).diagonal(offset=k)
开始迭代循环
# Iterate until every center stops moving (Gower distance 0 to its previous
# position).  BUGFIX: the original condition was `np.all(Error != 0)`, which
# exits as soon as ANY single center is stable -- before the others converge.
while np.any(Error != 0):
    # Assignment step: give each sample the label of its nearest center.
    for i in range(x.shape[0]):
        # Row 0 of the distance matrix holds sample i's distance to every
        # center.  BUGFIX: generalized from the hard-coded `reshape(1,14)` and
        # `[(0,0),(1,2)]` (which only worked for 14 columns and k=2), and
        # renamed the local so it no longer shadows the dist() function.
        d = gower_matrix(np.append(x[i].reshape(1, x.shape[1]), Center, axis=0),
                         cat_features=cat)[0, 1:k + 1]
        CLabel[i] = np.argmin(d)
    COld = copy.deepcopy(Center)  # remember the previous centers
    # Update step: each center becomes the mean of its members.  Empty
    # clusters keep their old center instead of producing a NaN mean.
    for i in range(k):
        members = CLabel == i
        if members.any():
            Center[i] = np.mean(x[members], axis=0)
    Error = gower_matrix(np.append(Center, COld, axis=0),
                         cat_features=cat).diagonal(offset=k)
模型评价
# Adjusted Rand index of the clustering against the label in column 13.
print(metrics.adjusted_rand_score(heart.iloc[:,13], CLabel))
# Silhouette score on the standardized features.  NOTE(review): this is
# computed with the Euclidean metric although the clustering used the Gower
# distance, so it is only a rough indicator -- confirm this is intended.
print(metrics.silhouette_score(heart_norm, CLabel, metric='euclidean'))