练习数据
name calories sodium alcohol cost
Budweiser 144 15 4.7 0.43
Schlitz 151 19 4.9 0.43
Lowenbrau 157 15 0.9 0.48
Kronenbourg 170 7 5.2 0.73
Heineken 152 11 5.0 0.77
Old_Milwaukee 145 23 4.6 0.28
Augsberger 175 24 5.5 0.40
Srohs_Bohemian_Style 149 27 4.7 0.42
Miller_Lite 99 10 4.3 0.43
Budweiser_Light 113 8 3.7 0.40
Coors 140 18 4.6 0.44
Coors_Light 102 15 4.1 0.46
Michelob_Light 135 11 4.2 0.50
Becks 150 19 4.7 0.76
Kirin 149 6 5.0 0.79
Pabst_Extra_Light 68 15 2.3 0.38
Hamms 139 19 4.4 0.43
Heilemans_Old_Style 144 24 4.9 0.43
Olympia_Goled_Light 72 6 2.9 0.46
Schlitz_Light 97 7 4.2 0.47
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# Jupyter magic: render matplotlib figures inline in the notebook
%matplotlib inline
# Load the space-separated beer dataset.
# NOTE(review): the path is a placeholder ("...") — TODO point it at the
# actual location of beer_data.txt before running.
beer = pd.read_csv(r"...\data\beer_data.txt", sep=' ')
# Feature matrix: the four numeric columns used for clustering
X = beer[['calories', 'sodium', 'alcohol', 'cost']]
K-Means clustering
# Fit K-Means with 3 clusters on the four numeric features
km = KMeans(n_clusters=3).fit(X)
# Attach each sample's cluster label back onto the dataframe
beer['cluster'] = km.labels_
# Per-cluster feature means as a DataFrame (km.cluster_centers_ gives the
# same centers, but only as an ndarray of the fitted features).
# numeric_only=True: the 'name' column is a string and cannot be averaged —
# older pandas silently dropped it, newer pandas raises without this flag.
centers = beer.groupby('cluster').mean(numeric_only=True).reset_index()
# Plot styling
plt.rcParams['font.size'] = 14
colors = np.array(['red', 'green', 'blue', 'yellow'])
# Scatter the samples in (calories, alcohol) space, colored by cluster
plt.scatter(beer['calories'], beer['alcohol'], c=colors[beer['cluster']])
# Mark the cluster centers
plt.scatter(centers['calories'], centers['alcohol'], marker='+', s=150, c='black')
plt.xlabel('Calories')
plt.ylabel('Alcohol')
矩阵散点图scatter_matrix
from pandas.plotting import scatter_matrix
# Pairwise scatter plots of the four features, colored by K-Means cluster
scatter_matrix(beer[['calories', 'sodium', 'alcohol', 'cost']], s=100, alpha=1, c=colors[beer['cluster']], figsize=(10,10))
# Fix: the model above was fit with n_clusters=3, not 2
plt.suptitle('With 3 centroids initialized')
标准化处理
from sklearn.preprocessing import StandardScaler
# Standardize every feature to zero mean / unit variance, then re-fit
# K-Means on the scaled data and keep those labels in a separate column.
X_scaled = StandardScaler().fit_transform(X)  # ndarray, not a DataFrame
km = KMeans(n_clusters=3).fit(X_scaled)
beer['scaled_cluster'] = km.labels_
聚类评估:轮廓系数(Silhouette Coefficient)
$$ s(i) = \frac{b(i) - a(i)}{\max\{a(i),\, b(i)\}} = \begin{cases} 1 - \dfrac{a(i)}{b(i)}, & a(i) < b(i) \\[4pt] 0, & a(i) = b(i) \\[4pt] \dfrac{b(i)}{a(i)} - 1, & a(i) > b(i) \end{cases} $$
- 计算样本i到同簇其他样本的平均距离ai。ai 越小,说明样本i越应该被聚类到该簇。将ai 称为样本i的簇内不相似度。
- 计算样本i到其他某簇Cj 的所有样本的平均距离b_{ij},称为样本i与簇Cj 的不相似度。定义样本i的簇间不相似度为:b_i = min{b_{i1}, b_{i2}, …, b_{ik}}
- si接近1,则说明样本i聚类合理
- si接近-1,则说明样本i更应该分类到另外的簇
- 若si 近似为0,则说明样本i在两个簇的边界上。
from sklearn import metrics
# Silhouette scores (range [-1, 1], higher = better-separated clusters).
# Both labelings are scored against the SAME unscaled X so they are compared
# in one feature space.
# NOTE(review): scoring scaled_cluster on unscaled X looks intentional for an
# apples-to-apples comparison — confirm X_scaled wasn't meant here.
score_scaled = metrics.silhouette_score(X, beer['scaled_cluster'])
score = metrics.silhouette_score(X, beer['cluster'])
print(score_scaled, score)
0.1797806808940007 0.6731775046455796
说明标准化处理不是万能的
求n_clusters的最佳值
# Sweep the cluster count and score each K-Means fit with the
# silhouette coefficient.
candidate_ks = range(2, 20)
scores = [
    metrics.silhouette_score(X, KMeans(n_clusters=k).fit(X).labels_)
    for k in candidate_ks
]
# Visualize score vs. number of clusters
plt.plot(candidate_ks, scores)
plt.xlabel("Number of Clusters Initialized")
plt.ylabel("Silhouette Score")
DBSCAN clustering
from sklearn.cluster import DBSCAN
# Density-based clustering: eps is the neighborhood radius and
# min_samples the minimum neighbors for a core point.
db_labels = DBSCAN(eps=10, min_samples=2).fit(X).labels_
beer['cluster_db'] = db_labels
# Pairwise scatter plots of the features, colored by DBSCAN label
scatter_matrix(beer[['calories', 'sodium', 'alcohol', 'cost']], s=100, alpha=1, c=colors[beer['cluster_db']], figsize=(10,10))
求eps, min_samples最佳值
# Sweep eps to find the best radius.
# (Per the original note: for eps >= 26 every point falls into one cluster,
# so silhouette scoring degenerates — hence the range stops at 25.)
scores = []
for eps in range(2, 26):
    labels = DBSCAN(eps=eps, min_samples=2).fit(X).labels_
    # Silhouette evaluated on the original (unscaled) features
    score = metrics.silhouette_score(X, labels)
    scores.append(score)
plt.plot(range(2, 26), scores)
plt.xlabel('Number of eps')
# Fix typo: 'Sihouette' -> 'Silhouette'
plt.ylabel('Silhouette Score')
print(max(scores))
0.6731775046455796
# Sweep min_samples at a fixed eps=20.
# (Per the original note: for min_samples >= 14 every point becomes noise,
# all labels are -1 and silhouette scoring fails — hence the range stops at 13.)
scores = []
for k in range(2, 14):
    labels = DBSCAN(eps=20, min_samples=k).fit(X).labels_
    score = metrics.silhouette_score(X, labels)
    scores.append(score)
plt.plot(range(2, 14), scores)
plt.xlabel('Number of min_samples')
# Fix typo: 'Sihouette' -> 'Silhouette'
plt.ylabel('Silhouette Score')
print(max(scores))
0.6731775046455796