K-means:该算法假设每个簇由具有连续特征的相似点构成,可以用相似的质心(平均值)和中心点来表示。应对密集、球状数据集时有着更为突出的效果。
# K-means
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Generate the data set: 150 two-feature samples around 3 centers.
X, y = make_blobs(
    n_samples=150, n_features=2, centers=3,
    cluster_std=0.5, shuffle=True, random_state=0,
)

# Plot the raw, unlabeled samples.
plt.scatter(X[:, 0], X[:, 1], c='white', marker='o', edgecolor='black', s=50)
plt.grid()
plt.tight_layout()
plt.show()

# Fit k-means with randomly initialized centroids and get a cluster id per sample.
km = KMeans(
    n_clusters=3, init='random', n_init=10,
    max_iter=300, tol=1e-04, random_state=0,
)
y_km = km.fit_predict(X)

# Visualization: one scatter call per cluster id, each with its own color/marker.
cluster_styles = (
    (0, 'lightgreen', 's', 'Cluster 1'),
    (1, 'orange', 'o', 'Cluster 2'),
    (2, 'lightblue', 'v', 'Cluster 3'),
)
for cluster_id, color, marker, label in cluster_styles:
    members = X[y_km == cluster_id]
    plt.scatter(members[:, 0], members[:, 1],
                s=50, c=color,
                marker=marker, edgecolor='black',
                label=label)

# Overlay the fitted centroids as large red stars.
centroids = km.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            s=250, marker='*',
            c='red', edgecolor='black',
            label='Centroids')
plt.legend(scatterpoints=1)
plt.grid()
plt.tight_layout()
plt.show()
层次聚类 Hierarchical Clustering:通过绘制树状图,将聚类过程进行可视化;不需要预先指定簇的个数,在一些无法预先获悉样本分布特点的数据上具有明显优势。
# Hierarchical clustering
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist, squareform

# Data set: 5 labelled samples with 3 random features each (seeded for repeatability).
np.random.seed(123)
variables = ['X', 'Y', 'Z']
labels = ['ID_0', 'ID_1', 'ID_2', 'ID_3', 'ID_4']
X = np.random.random_sample([5, 3]) * 10
df = pd.DataFrame(X, columns=variables, index=labels)

# Condensed pairwise Euclidean distances, computed once and reused for both
# the square distance table and the linkage step.
condensed_dist = pdist(df, metric='euclidean')
row_dist = pd.DataFrame(squareform(condensed_dist), columns=labels, index=labels)

# Complete-linkage agglomerative clustering on the condensed distances.
row_clusters = linkage(condensed_dist, method='complete')

# Tabulate the merge steps. The table is bound to a name and printed because a
# bare expression is silently discarded when this runs as a script.
linkage_table = pd.DataFrame(
    row_clusters,
    columns=['row label 1',
             'row label 2',
             'distance',
             'no. of items in clust.'],
    index=['cluster %d' % step for step in range(1, row_clusters.shape[0] + 1)],
)
print(linkage_table)

# Draw the dendrogram. The y-label is set before tight_layout() so that the
# layout computation accounts for it.
row_dendr = dendrogram(row_clusters, labels=labels)
plt.ylabel('Euclidean distance')
plt.tight_layout()
plt.show()
密度聚类DBSCAN:用密度取代数据的相似性,按照数据样本点的发布密度差异,将样本点密度足够大的区域联结在一起,以期能够发现任意形状的簇。不需要预设聚类个数,也不需要手动选择聚类层级。
# DBSCAN
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.datasets import make_moons

# Data set: 200 samples from make_moons with a little noise, seeded for repeatability.
X, y = make_moons(n_samples=200, noise=0.05, random_state=0)

# Preview the raw samples before any clustering is applied.
first_feature = X[:, 0]
second_feature = X[:, 1]
plt.scatter(first_feature, second_feature)
plt.tight_layout()
plt.show()
三种聚类结果展示:
# K-means
# Side-by-side comparison: k-means on the left panel, agglomerative
# clustering on the right panel, both asked for 2 clusters on X.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(8, 3))

km = KMeans(n_clusters=2, random_state=0)
y_km = km.fit_predict(X)
ax1.scatter(X[y_km == 0, 0],
            X[y_km == 0, 1],
            c='lightblue',
            edgecolor='black',
            marker='o',
            s=40,
            label='cluster 1')
ax1.scatter(X[y_km == 1, 0],
            X[y_km == 1, 1],
            c='red',
            edgecolor='black',
            marker='s',
            s=40,
            label='cluster 2')
ax1.set_title('K-means clustering')
# Each panel needs its own legend: pyplot-level plt.legend() only annotates
# the current Axes, which left this panel's labels undisplayed.
ax1.legend()

# hierarchical
ac = AgglomerativeClustering(n_clusters=2, metric='euclidean', linkage='complete')
y_ac = ac.fit_predict(X)
ax2.scatter(X[y_ac == 0, 0],
            X[y_ac == 0, 1],
            c='lightblue',
            edgecolor='black',
            marker='o',
            s=40,
            label='Cluster 1')
ax2.scatter(X[y_ac == 1, 0],
            X[y_ac == 1, 1],
            c='red',
            edgecolor='black',
            marker='s',
            s=40,
            label='Cluster 2')
ax2.set_title('Agglomerative clustering')
ax2.legend()

plt.tight_layout()
plt.show()
# DBSCAN
from sklearn.cluster import DBSCAN

# Density-based clustering of X with a Euclidean neighborhood radius of 0.2
# and at least 5 samples per dense neighborhood.
db = DBSCAN(eps=0.2,
            min_samples=5,
            metric='euclidean')
y_db = db.fit_predict(X)

plt.scatter(X[y_db == 0, 0],
            X[y_db == 0, 1],
            c='lightblue',
            edgecolor='black',
            marker='o',
            s=40,
            label='Cluster 1')
plt.scatter(X[y_db == 1, 0],
            X[y_db == 1, 1],
            c='red',
            edgecolor='black',
            marker='s',
            s=40,
            label='Cluster 2')

# DBSCAN assigns the label -1 to samples it treats as noise. Draw them when
# present so they are not silently missing from the figure; when there are
# none, the plot is exactly the two clusters above.
noise_mask = y_db == -1
if noise_mask.any():
    plt.scatter(X[noise_mask, 0],
                X[noise_mask, 1],
                c='gray',
                marker='x',
                s=40,
                label='Noise')

plt.legend()
plt.tight_layout()
plt.show()