python数据分析聚类_Python数据分析与机器学习-聚类实践

源码下载:

http://download.csdn.net/download/adam_zs/10195505

24fa02b049fec3bed5db6db51826624b.png

import pandas as pd

# Load the beer data set from a space-separated text file.
beer = pd.read_csv("data.txt", sep=" ")

# First rows of the data, for reference:
#
#           name  calories  sodium  alcohol  cost
# 0    Budweiser       144      15      4.7  0.43
# 1      Schlitz       151      19      4.9  0.43
# 2    Lowenbrau       157      15      0.9  0.48
# 3  Kronenbourg       170       7      5.2  0.73
# 4     Heineken       152      11      5.0  0.77

# Feature columns handed to the clustering algorithms; "name" is left out.
feature_columns = ["calories", "sodium", "alcohol", "cost"]
X = beer[feature_columns]

# --- K-means clustering ---
from sklearn.cluster import KMeans

# Partition the rows of X into three clusters.
km = KMeans(n_clusters=3).fit(X)

# Attach each row's cluster label to the data frame and group rows by label.
beer["cluster_3"] = km.labels_
beer.sort_values("cluster_3", inplace=True)

# Centroids as computed by K-means itself.
cluster_centers = km.cluster_centers_

# Per-cluster mean of every numeric column, used as the centre points when
# plotting. numeric_only=True skips the string "name" column; without it
# pandas >= 2.0 raises TypeError when asked to average strings.
centers = beer.groupby("cluster_3").mean(numeric_only=True).reset_index()

import matplotlib.pyplot as plt
import numpy as np

# Colour palette; indexing this array with the cluster-label column yields
# one colour per row.
colors = np.array(['red', 'green', 'blue', 'yellow'])

# Optional single scatter plot (calories vs. alcohol) with the cluster centres:
# plt.scatter(beer["calories"], beer["alcohol"], c=colors[beer["cluster_3"]])  # data points
# plt.scatter(centers["calories"], centers["alcohol"], marker='+', s=300, c='black')  # centre points
# plt.xlabel("Calories")
# plt.ylabel("Alcohol")
# plt.show()

# The old `pandas.tools.plotting` module no longer exists in pandas;
# `scatter_matrix` is provided by `pandas.plotting`.
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of the four features, coloured by K-means cluster.
scatter_matrix(
    beer[["calories", "sodium", "alcohol", "cost"]],
    s=100,
    alpha=1,
    c=colors[beer["cluster_3"]],
    figsize=(10, 10),
)
# plt.suptitle("With 3 centroids initialized")
# plt.show()

# --- Feature standardization ---
from sklearn.preprocessing import StandardScaler

# Fit the scaler on X and transform X in one step.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# First rows of X_scaled, for reference:
#
# [ 0.38791334  0.00779468  0.43380786 -0.45682969]
# [ 0.6250656   0.63136906  0.62241997 -0.45682969]
# [ 0.82833896  0.00779468 -3.14982226 -0.10269815]
# [ 1.26876459 -1.23935408  0.90533814  1.66795955]
# [ 0.65894449 -0.6157797   0.71672602  1.95126478]

# --- Cluster evaluation: silhouette coefficient ---
#
# For every sample i:
#   * a(i) is the mean distance from i to the other samples of its own
#     cluster (intra-cluster dissimilarity). The smaller a(i), the better i
#     fits its cluster.
#   * b(i, j) is the mean distance from i to all samples of another cluster
#     C_j; the inter-cluster dissimilarity is b(i) = min{b(i, 1), ..., b(i, k)}.
#   * s(i) = (b(i) - a(i)) / max(a(i), b(i)).
#
# s(i) close to 1  -> sample i is clustered sensibly.
# s(i) close to -1 -> sample i would fit better in another cluster.
# s(i) close to 0  -> sample i lies on the border between two clusters.
from sklearn import metrics

# Try several values of n_clusters and compare their silhouette scores.
# The silhouette score needs 2 <= n_clusters <= n_samples - 1, so the upper
# bound is capped by the size of the data set.
cluster_counts = list(range(2, min(20, len(X))))

scores = []
for k in cluster_counts:
    labels = KMeans(n_clusters=k).fit(X).labels_
    scores.append(metrics.silhouette_score(X, labels))

# Report best score first. Each score stays paired with the k that produced
# it; sorting the bare score list would attach scores to the wrong k.
ranked = sorted(zip(cluster_counts, scores), key=lambda pair: pair[1], reverse=True)
for k, score in ranked:
    print(k, "\t-", score)

# Plot in k order (unsorted) so the x axis matches the scores.
plt.plot(cluster_counts, scores)
plt.xlabel("Number of Clusters Initialized")
plt.ylabel("Silhouette Score")
plt.show()

import pandas as pd

# Reload the data so the DBSCAN experiment starts from an unmodified frame.
beer = pd.read_csv("data.txt", sep=" ")
X = beer[["calories", "sodium", "alcohol", "cost"]]

from sklearn.cluster import DBSCAN

# Density-based clustering with a neighbourhood radius of 10 and at least
# two samples per neighbourhood.
db = DBSCAN(eps=10, min_samples=2)
db.fit(X)

# Attach each row's label to the data frame and group rows by label.
beer["cluster_db"] = db.labels_
beer.sort_values("cluster_db", inplace=True)
# print(beer.groupby("cluster_db").mean())

import numpy as np
import matplotlib.pyplot as plt

# The old `pandas.tools.plotting` module no longer exists in pandas;
# `scatter_matrix` is provided by `pandas.plotting`.
from pandas.plotting import scatter_matrix

# Colour palette indexed by cluster label. DBSCAN marks noise samples with
# the label -1, which as an index selects the last entry ('yellow').
colors = np.array(['red', 'green', 'blue', 'yellow'])

# Pairwise scatter plots of the four features, coloured by DBSCAN label.
scatter_matrix(
    beer[["calories", "sodium", "alcohol", "cost"]],
    s=100,
    alpha=1,
    c=colors[beer["cluster_db"]],
    figsize=(10, 10),
)
# DBSCAN has no centroids, so the title names the parameters actually used.
plt.suptitle("DBSCAN clustering (eps=10, min_samples=2)")
# plt.show()

# Sweep eps and min_samples for DBSCAN and record the silhouette score of
# every combination as an "eps min_samples score" line.
scores = []

from sklearn import metrics

for eps in range(8, 15):
    for min_samples in range(2, 5):
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit(X).labels_
        # silhouette_score raises ValueError unless the number of distinct
        # labels is between 2 and n_samples - 1 (e.g. when every sample ends
        # up in one cluster or is marked as noise). Record nan for such
        # combinations instead of aborting the whole sweep.
        # NOTE(review): the noise label -1 is scored like an ordinary
        # cluster here - confirm that is the intended evaluation.
        if 1 < len(set(labels)) < len(X):
            score = metrics.silhouette_score(X, labels)
        else:
            score = float("nan")
        scores.append(" ".join([str(eps), str(min_samples), str(score)]))

for line in scores:
    print(line)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值