# coding=utf-8
"""
author: lei
function: cluster the beer data set with KMeans and DBSCAN and compare the results
"""
import pandas as pd
from sklearn.cluster import KMeans
# Load the beer table and cluster it on four numeric features with KMeans.
# NOTE(review): the separator in the original call was an empty string, which
# is not a usable delimiter; a single space is assumed here -- confirm against
# the actual layout of data.txt.
beer = pd.read_csv("data.txt", sep=" ")
X = beer[["calories", "sodium", "alcohol", "cost"]]

# Fit two models so the 3-cluster and 2-cluster partitions can be compared.
km = KMeans(n_clusters=3).fit(X)
km2 = KMeans(n_clusters=2).fit(X)
beer["cluster"] = km.labels_
beer["cluster2"] = km2.labels_

# Sort the rows by cluster label. sort_values is a method, so it must be
# called rather than subscripted. The result is not stored, so it is only
# visible when run interactively.
beer.sort_values("cluster")

cluster_centers = km.cluster_centers_
cluster_centers_2 = km2.cluster_centers_

# Per-cluster means of every column for the 3- and 2-cluster partitions.
# numeric_only=True keeps any non-numeric column out of the aggregation
# instead of raising on recent pandas versions.
beer.groupby("cluster").mean(numeric_only=True)
beer.groupby("cluster2").mean(numeric_only=True)
centers = beer.groupby("cluster").mean(numeric_only=True).reset_index()
# Plotting: calories vs. alcohol, coloured by the 3-cluster KMeans label.
# NOTE(review): no plt.show()/savefig call is visible in this section, so the
# figure presumably only appears in an interactive/notebook session -- confirm.
import matplotlib.pyplot as plt
plt.rcParams["font.size"] = 14
import numpy as np
colors = np.array(["red", "green", "blue", "yellow"])
# Index the colour palette with each row's cluster label.
plt.scatter(beer["calories"], beer["alcohol"], c=colors[beer["cluster"]])
# Overlay the per-cluster means as large black "+" markers.
plt.scatter(centers.calories, centers.alcohol, linewidths=3, marker="+", s=300, c="black")
plt.xlabel("Calories")
plt.ylabel("Alcohol")
# Standardization: rescale the features with StandardScaler, then re-run the
# 3-cluster KMeans on the scaled matrix.
from sklearn.preprocessing import StandardScaler
scalar = StandardScaler()
X_scaled = scalar.fit_transform(X)
# Rebinds km: from here on it refers to the model fitted on the scaled data.
km = KMeans(n_clusters=3).fit(X_scaled)
beer["scaled_cluster"] = km.labels_
# sort_values is a method and must be called, not subscripted. The result is
# not stored, so it is only visible when run interactively.
beer.sort_values("scaled_cluster")
# Cluster evaluation
from sklearn import metrics
# Silhouette coefficient. Both scores are computed against the unscaled
# feature matrix X; only the label sets differ.
score_scaled = metrics.silhouette_score(X, beer.scaled_cluster)  # labels fitted on standardized features
score = metrics.silhouette_score(X, beer.cluster)  # labels fitted on raw features
print(score_scaled, score)

# Silhouette score for a sweep of cluster counts on the raw features.
# silhouette_score needs 2 <= n_labels <= n_samples - 1, so the sweep is capped
# at the number of rows; with 200 or more rows the range is the original
# 2..199.
scores = []
for k in range(2, min(200, len(X))):
    labels = KMeans(n_clusters=k).fit(X).labels_
    score = metrics.silhouette_score(X, labels)
    scores.append(score)
# DBSCAN: density-based clustering on the raw (unscaled) features.
from sklearn.cluster import DBSCAN
db = DBSCAN(eps=10, min_samples=2).fit(X)  # eps is the neighbourhood radius; min_samples is the minimum point count
labels = db.labels_  # clustering result for each row
beer["cluster_db"] = labels  # add it as a new column
# sort_values is a method and must be called, not subscripted. The result is
# not stored, so it is only visible when run interactively.
beer.sort_values("cluster_db")