import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
# Load the beer data set from a GBK-encoded text file.
# NOTE(review): the original passed sep='' (an empty delimiter), which
# read_csv cannot parse with. A single space is assumed to be the intended
# delimiter -- confirm against the actual layout of data.txt.
beer = pd.read_csv('data.txt', encoding='gbk', sep=' ')

# Feature columns used for every clustering run in this script.
X = beer[["calories", "sodium", "alcohol", "cost"]]

# Cluster the raw (unscaled) features into three groups and keep the
# per-row labels next to the data.
km = KMeans(n_clusters=3).fit(X)
beer['cluster'] = km.labels_
centers = km.cluster_centers_

plt.rcParams['font.size'] = 14
# Colour lookup table, indexed by the integer cluster label of each row.
colors = np.array(['red', 'green', 'blue', 'yellow'])

# Calories vs. alcohol coloured by cluster. Columns 0 and 2 of `centers`
# follow the column order of X, i.e. "calories" and "alcohol".
plt.scatter(beer["calories"], beer["alcohol"], c=colors[beer["cluster"]])
plt.scatter(centers[:, 0], centers[:, 2], linewidths=3, marker='+', s=300, c='black')
plt.xlabel("Calories")
# Fixed typo: `plt.ylable` does not exist and raised AttributeError.
plt.ylabel("Alcohol")
plt.suptitle("Calories and Alcohol")

# Pairwise scatter plots of all four features, coloured by raw-data cluster.
pd.plotting.scatter_matrix(
    beer[["calories", "sodium", "alcohol", "cost"]],
    s=100,
    alpha=1,
    c=colors[beer["cluster"]],
    figsize=(10, 10),
)
plt.suptitle("original data")
# Standardise the feature columns, then repeat the 3-cluster KMeans fit on
# the standardised values.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

km = KMeans(n_clusters=3)
km.fit(X_scaled)
beer["scaled_cluster"] = km.labels_
centers = km.cluster_centers_

# Scatter matrix of the original feature values, coloured by the labels
# obtained from the standardised data.
pd.plotting.scatter_matrix(
    X,
    alpha=1,
    figsize=(10, 10),
    s=100,
    c=colors[beer["scaled_cluster"]],
)
plt.suptitle("standard data")

# Silhouette score of each labelling. Both are evaluated against the
# unscaled X, so the two numbers are measured in the same feature space.
score_scaled = metrics.silhouette_score(X, beer["scaled_cluster"])
score = metrics.silhouette_score(X, beer["cluster"])
# The printed prefix is Chinese for "the score is".
print("得分为", score_scaled, score)
# Sweep the number of clusters over 2..19 on the raw features and record
# the silhouette score obtained for each k.
k_values = range(2, 20)
scores = []
for k in k_values:
    labels = KMeans(n_clusters=k).fit(X).labels_
    score = metrics.silhouette_score(X, labels)
    scores.append(score)

# Print one (k, score) tuple per cluster count.
for n_clusters, value in zip(k_values, scores):
    print((n_clusters, value))
# Best score of the sweep. The original called max() on a single float
# (`max(scores[i])`), which raises TypeError; the maximum is taken over the
# whole list and printed once.
print(max(scores))

# Plot silhouette score against the number of clusters.
plt.figure()
plt.plot(list(k_values), scores, "ro")
plt.xlabel("Number of Clusters Initialized")
plt.ylabel("Sihouette Score")
plt.suptitle("K parameter optimize")
plt.show()
# Repeat the k sweep (2..19) on the standardised features; here both the
# clustering and the silhouette score use X_scaled.
scores = []
for k in range(2, 20):
    labels = KMeans(n_clusters=k).fit(X_scaled).labels_
    score = metrics.silhouette_score(X_scaled, labels)
    scores.append(score)

# Print one (k, score) tuple per cluster count; the first entry is k=2.
for n_clusters, value in enumerate(scores, start=2):
    print((n_clusters, value))