记录一下利用两种聚类方法K-Means和DBSCAN对搜集到的啤酒集进行聚类。
导库读数据
# Load the beer dataset (space-separated text file).
import pandas as pd
beer = pd.read_csv('data.txt', sep=' ')
beer
# Feature matrix: the four numeric columns used for clustering.
# NOTE(review): the original comment said "labels" — these are features,
# not labels; clustering here is unsupervised.
X = beer[["calories","sodium","alcohol","cost"]]
K-means clustering
from sklearn.cluster import KMeans  # scikit-learn K-Means
# Fit K-Means with 3 clusters and with 2 clusters on the raw (unscaled) features.
# FIX(review): random_state pins the otherwise random centroid initialisation
# so the label arrays shown below are reproducible; n_init is stated explicitly
# because its default changed across scikit-learn versions.
km = KMeans(n_clusters=3, n_init=10, random_state=42).fit(X)
km2 = KMeans(n_clusters=2, n_init=10, random_state=42).fit(X)
km.labels_  # one cluster id (0, 1 or 2) per sample
array([0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 2, 0, 0, 0, 1, 0, 0, 1, 2])
# Attach each fit's cluster assignment to the data frame as a new column,
# then display the rows ordered by the 3-cluster labelling.
beer["cluster"] = km.labels_
beer["cluster2"] = km2.labels_
beer.sort_values("cluster")
# pandas.tools.plotting was removed from pandas; scatter_matrix now lives
# in pandas.plotting.
from pandas.plotting import scatter_matrix
%matplotlib inline
# Centroid coordinates of each fit.
# NOTE(review): these two variables are never used below — the plotted
# centers are recomputed from the per-cluster means instead.
cluster_centers = km.cluster_centers_
cluster_centers_2 = km2.cluster_centers_
beer.groupby("cluster").mean()   # per-cluster feature means (3-cluster fit)
beer.groupby("cluster2").mean()  # same for the 2-cluster fit
# Cluster "centers" as per-cluster means, kept for plotting below.
centers = beer.groupby("cluster").mean().reset_index()
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 14
import numpy as np
# One color per cluster id. NOTE(review): the DBSCAN noise label -1 (used
# later) indexes colors[-1] == 'yellow' via negative indexing — it works,
# but only by accident of the palette size.
colors = np.array(['red', 'green', 'blue', 'yellow'])
# Scatter of two features colored by cluster; centroids drawn as black '+'.
plt.scatter(beer["calories"], beer["alcohol"],c=colors[beer["cluster"]])
plt.scatter(centers.calories, centers.alcohol, linewidths=3, marker='+', s=300, c='black')
plt.xlabel("Calories")
plt.ylabel("Alcohol")
# Pairwise scatter plots of all four features, colored by cluster, to
# inspect the clustering result across every pair of variables.
scatter_matrix(beer[["calories","sodium","alcohol","cost"]],s=100, alpha=1, c=colors[beer["cluster"]], figsize=(10,10))
plt.suptitle("With 3 centroids initialized")
scatter_matrix(beer[["calories","sodium","alcohol","cost"]],s=100, alpha=1, c=colors[beer["cluster2"]], figsize=(10,10))
plt.suptitle("With 2 centroids initialized")
Scaled data 标准化数据
# Standardize the features before clustering, hoping to improve the result.
# NOTE(review): the original comment said "scale to 0~1" — StandardScaler
# actually z-scores each column (zero mean, unit variance); 0~1 scaling
# would be MinMaxScaler.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled
# Re-fit K-Means (3 clusters) on the standardized features.
km = KMeans(n_clusters=3).fit(X_scaled)
beer["scaled_cluster"] = km.labels_
beer.sort_values("scaled_cluster")
What are the “characteristics” of each cluster?
# Per-cluster means and pairwise scatter for the scaled-feature clustering.
beer.groupby("scaled_cluster").mean()
scatter_matrix(X, c=colors[beer.scaled_cluster], alpha=1, figsize=(10,10), s=100)
聚类评估:轮廓系数(Silhouette Coefficient )
- 计算样本i到同簇其他样本的平均距离ai。ai 越小,说明样本i越应该被聚类到该簇。将ai 称为样本i的簇内不相似度。
- 计算样本i到其他某簇Cj 的所有样本的平均距离bij,称为样本i与簇Cj 的不相似度。定义样本i的簇间不相似度:bi = min{bi1, bi2, …, bik}
- 轮廓系数定义为:si = (bi − ai) / max(ai, bi)
- si接近1,则说明样本i聚类合理
- si接近-1,则说明样本i更应该分类到另外的簇
- 若si 近似为0,则说明样本i在两个簇的边界上。
from sklearn import metrics
# silhouette_score takes the feature matrix and a cluster assignment and
# returns the mean silhouette coefficient (higher is better).
# Both clusterings are scored against the same raw X so the two numbers
# are directly comparable.
score_scaled = metrics.silhouette_score(X,beer.scaled_cluster)
score = metrics.silhouette_score(X,beer.cluster)
print(score_scaled, score)
0.1797806808940007 0.6731775046455796
发现做了标准化反而低了!平时不一定非要做标准化,要根据实际的指标重要性情况!
# Mean silhouette coefficient for every candidate cluster count k = 2..19.
scores = [
    metrics.silhouette_score(X, KMeans(n_clusters=k).fit(X).labels_)
    for k in range(2, 20)
]
scores
[0.6917656034079486,
0.6731775046455796,
0.5857040721127795,
0.422548733517202,
0.4559182167013377,
0.43776116697963124,
0.38946337473125997,
0.39746405172426014,
0.3915697409245163,
0.32472080133848924,
0.3459775237127248,
0.31221439248428434,
0.30707782144770296,
0.31834561839139497,
0.2849514001174898,
0.23498077333071996,
0.1588091017496281,
0.08423051380151177]
# Silhouette score as a function of the number of clusters.
plt.plot(list(range(2,20)), scores)
plt.xlabel("Number of Clusters Initialized")
plt.ylabel("Silhouette Score")  # FIX(review): was misspelled "Sihouette"
发现当簇的个数为2时比较好
K-Means流程:
1.聚类
2.可视化展示
3.评估
4.看哪个值比较合适,再做一次
DBSCAN clustering
from sklearn.cluster import DBSCAN  # suits irregular clusters; K-Means suffices for simple data
db = DBSCAN(eps=10, min_samples=2).fit(X)  # eps: neighbourhood radius 10; min_samples: 2 points per neighbourhood
labels = db.labels_  # cluster ids; -1 marks noise/outlier points
labels
array([ 0, 0, 0, -1, 0, 0, -1, 0, 1, -1, 0, 1, 0, 0, 0, 2, 0,
0, 2, 1], dtype=int64)
# Attach the DBSCAN assignment and inspect it the same way as K-Means.
beer['cluster_db'] = labels
beer.sort_values('cluster_db')
beer.groupby('cluster_db').mean()
# NOTE(review): noise label -1 indexes colors[-1] ('yellow') via Python's
# negative indexing — it renders, but only by accident of the palette size.
scatter_matrix(X, c=colors[beer.cluster_db], figsize=(10,10), s=100)
然后就是用轮廓系数评估
from sklearn import metrics
# Evaluate the clusterings with the mean silhouette coefficient.
# FIX(review): this cell previously just repeated the K-Means evaluation —
# the DBSCAN labels (cluster_db) were never scored, even though the text
# above says DBSCAN is being evaluated here. Score them as well so all
# three clusterings can actually be compared.
score_scaled = metrics.silhouette_score(X, beer.scaled_cluster)
score = metrics.silhouette_score(X, beer.cluster)
score_db = metrics.silhouette_score(X, beer.cluster_db)
print(score_scaled, score, score_db)
0.1797806808940007 0.6731775046455796
# Sweep the DBSCAN radius eps over 2..19 (min_samples fixed at 2) and
# record the silhouette score of each clustering.
scores = []
for eps in range(2, 20):  # FIX(review): renamed k -> eps; it is the radius, not a cluster count
    labels = DBSCAN(eps=eps, min_samples=2).fit(X).labels_
    # silhouette_score raises ValueError when the fit collapses into a
    # single label (one cluster, or all noise); record a -1 sentinel for
    # such degenerate clusterings instead of crashing the sweep.
    if len(set(labels)) > 1:
        scores.append(metrics.silhouette_score(X, labels))
    else:
        scores.append(-1.0)
scores
[-0.03670705609846274,
-0.03670705609846274,
-0.06781609566358748,
-0.06781609566358748,
0.1626084889128696,
0.12626205982196476,
0.16564759416041527,
0.42951251219183106,
0.49530955296776086,
0.49530955296776086,
0.49530955296776086,
0.49530955296776086,
0.5857040721127795,
0.5857040721127795,
0.5238781710613801,
0.5238781710613801,
0.6731775046455796,
0.6731775046455796]
# Silhouette score as a function of eps (min_samples=2).
plt.plot(list(range(2,20)), scores)
plt.xlabel("eps=k, min_samples=2")
plt.ylabel("Silhouette Score")  # FIX(review): was misspelled "Sihouette"
发现当min_samples=2时,eps取18~19比较好(注意扫描范围是range(2,20),最大只到19,并不包含20)
# Sweep min_samples over 2..9 with eps fixed at 10.
scores1 = []
for m in range(2, 10):
    labels = DBSCAN(eps=10, min_samples=m).fit(X).labels_
    # Guard against degenerate clusterings (a single label would make
    # silhouette_score raise ValueError); record a -1 sentinel instead.
    if len(set(labels)) > 1:
        scores1.append(metrics.silhouette_score(X, labels))
    else:
        scores1.append(-1.0)
scores1
[0.49530955296776086,
0.42951251219183106,
0.4767963143919395,
0.4767963143919395,
0.4767963143919395,
0.390839395721598,
0.390839395721598,
0.21891774205673578]
# Silhouette score as a function of min_samples (eps=10).
plt.plot(list(range(2,10)), scores1)
plt.xlabel("eps=10, min_samples=k")
plt.ylabel("Silhouette Score")  # FIX(review): was misspelled "Sihouette"
发现当eps=10时,min_samples=2的轮廓系数最高(约0.495),min_samples=4~6则是一个较稳定的次优区间
此处可以将两个变量做成一个2D图比较两个指标分别取什么值时是最好的。