# Skr-Eric的机器学习课堂（六）-- 聚类

## 聚类

### 1.从人的视觉到机器的数觉

PQ=sqrt((x1-x2)^2)

PQ=sqrt((x1-x2)^2+(y1-y2)^2)

PQ=sqrt((x1-x2)^2+(y1-y2)^2+(z1-z2)^2)

N维：P(x1,x2,...,xn),Q(x1',x2',...,xn')

PQ=sqrt((x1-x1')^2+(x2-x2')^2+...+(xn-xn')^2)

### 2.K均值算法

通过性能指标优选最好的K。

以最大间距原则选择初始聚类中心。

# -*- coding: utf-8 -*-
# K-Means clustering demo: load 2-D points from a text file, cluster
# them into K=4 groups, print the silhouette score, and plot the
# partition of the plane together with the cluster centers.
from __future__ import unicode_literals
import numpy as np
import sklearn.cluster as sc
import sklearn.metrics as sm
import matplotlib.pyplot as mp

# Load samples: one comma-separated (x, y) pair per line.
x = []
with open('../../data/multiple3.txt', 'r') as f:
    # NOTE(review): the per-line loop was lost in extraction;
    # reconstructed as iterating over the file's lines — confirm.
    for line in f:
        data = [float(substr) for substr in line.split(',')]
        x.append(data)
x = np.array(x)

# K (number of clusters) = 4
model = sc.KMeans(n_clusters=4)
model.fit(x)
pred_y = model.labels_             # cluster label of each sample
centers = model.cluster_centers_   # geometric center of each cluster

# Silhouette score in [-1, 1]; closer to 1 means better separation.
print(sm.silhouette_score(
    x, pred_y, sample_size=len(x), metric="euclidean"))

# Evaluate the model on a dense grid to visualize the class regions.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)

mp.figure('K-Means Cluster', facecolor='lightgray')
mp.title('K-Means Cluster', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
mp.scatter(x[:, 0], x[:, 1], c=pred_y, cmap='brg', s=60)
mp.scatter(centers[:, 0], centers[:, 1], marker='+',
           c='gold', s=1000, linewidth=1)
mp.show()

示例（图像量化，K=4，numpy 数组输出格式）：

y = [0 0 1 1 2 2 3 3]（每个像素所属聚类的标签）

centers = [100 200 300 400]（各聚类中心的灰度值）

image2 = [100 100 200 200 300 300 400 400]（即 centers[y]，量化后的像素值）

# -*- coding: utf-8 -*-
# Image quantization with K-Means: cluster the gray levels of an
# image into K values and rebuild the image from the K cluster
# centers, shown side by side for K = 4, 3, 2.
from __future__ import unicode_literals
import numpy as np
import scipy.misc as sm
import sklearn.cluster as sc
import matplotlib.pyplot as mp


def _quantize(samples, shape, n_clusters):
    """Cluster pixel intensities into n_clusters gray levels and
    rebuild an image of the given shape from the cluster centers."""
    model = sc.KMeans(n_clusters=n_clusters)
    model.fit(samples)
    y = model.labels_
    centers = model.cluster_centers_.squeeze()
    # Replace every pixel by its cluster's center gray level.
    return centers[y].reshape(shape)


# NOTE(review): the image-loading line was truncated in the source;
# presumably it read a grayscale image — confirm the file path.
image1 = sm.imread('../../data/lily.jpg', True).astype(np.uint8)
x = image1.reshape(-1, 1)          # one intensity sample per pixel
image2 = _quantize(x, image1.shape, 4)
image3 = _quantize(x, image1.shape, 3)
image4 = _quantize(x, image1.shape, 2)

mp.figure('Image Quantization', facecolor='lightgray')
for index, (title, image) in enumerate([
        ('Original', image1), ('Quanted-4', image2),
        ('Quanted-3', image3), ('Quanted-2', image4)], start=1):
    mp.subplot(2, 2, index)
    mp.title(title, fontsize=16)
    mp.axis('off')
    mp.imshow(image, cmap='gray')
mp.tight_layout()
mp.show()

### 3.均值漂移算法

K均值算法的聚类中心——几何中心（基于中心的聚类）；均值漂移算法则沿概率密度上升方向移动聚类中心（基于密度的聚类）。

# -*- coding: utf-8 -*-
# Mean-shift clustering demo: estimate the kernel bandwidth from the
# data, fit the model, and plot the class regions and centers.
from __future__ import unicode_literals
import numpy as np
import sklearn.cluster as sc
import matplotlib.pyplot as mp

# Load samples: one comma-separated (x, y) pair per line.
x = []
with open('../../data/multiple3.txt', 'r') as f:
    # NOTE(review): the per-line loop was lost in extraction;
    # reconstructed as iterating over the file's lines — confirm.
    for line in f:
        data = [float(substr) for substr in line.split(',')]
        x.append(data)
x = np.array(x)

# Kernel bandwidth estimated from the sample spread.
bw = sc.estimate_bandwidth(x, n_samples=len(x), quantile=0.1)
model = sc.MeanShift(bandwidth=bw, bin_seeding=True)
model.fit(x)
pred_y = model.labels_
centers = model.cluster_centers_

# Evaluate the model on a dense grid to visualize the class regions.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)

mp.figure('Mean Shift Cluster', facecolor='lightgray')
mp.title('Mean Shift Cluster', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
mp.scatter(x[:, 0], x[:, 1], c=pred_y, cmap='brg', s=60)
mp.scatter(centers[:, 0], centers[:, 1], marker='+',
           c='gold', s=1000, linewidth=1)
mp.show()

### 4.凝聚层次算法

# -*- coding: utf-8 -*-
# Agglomerative (hierarchical) clustering demo: search the cluster
# count in [2, 10] by silhouette score and plot the best partition.
from __future__ import unicode_literals
import numpy as np
import sklearn.cluster as sc
import sklearn.metrics as sm
import matplotlib.pyplot as mp

# Load samples: one comma-separated (x, y) pair per line.
x = []
# with open('../../data/multiple3.txt', 'r') as f:
with open('../../data/perf.txt', 'r') as f:
    # NOTE(review): the per-line loop was lost in extraction;
    # reconstructed as iterating over the file's lines — confirm.
    for line in f:
        data = [float(substr) for substr in line.split(',')]
        x.append(data)
x = np.array(x)

# Fit one model per candidate cluster count and score each.
clstrs, scores, models = np.arange(2, 11), [], []
for n_clusters in clstrs:
    model = sc.AgglomerativeClustering(n_clusters=n_clusters)
    model.fit(x)
    score = sm.silhouette_score(
        x, model.labels_, sample_size=len(x),
        metric='euclidean')
    scores.append(score)
    models.append(model)
# Keep the labeling with the highest silhouette score.
pred_y = models[np.array(scores).argmax()].labels_

mp.figure('Agglomerative Cluster', facecolor='lightgray')
mp.title('Agglomerative Cluster', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.scatter(x[:, 0], x[:, 1], c=pred_y, cmap='brg', s=60)
mp.show()

# -*- coding: utf-8 -*-
# Agglomerative clustering on a noisy spiral: compare plain
# (distance-only) clustering against connectivity-constrained
# clustering that respects the 10-nearest-neighbor graph.
from __future__ import unicode_literals
import numpy as np
import sklearn.cluster as sc
import sklearn.neighbors as nb
import matplotlib.pyplot as mp

n_samples = 500
# Parametric spiral with additive uniform noise.
t = 2.5 * np.pi * (1 + 2 * np.random.rand(n_samples, 1))
x = 0.05 * t * np.cos(t)
y = 0.05 * t * np.sin(t)
n = 0.05 * np.random.rand(n_samples, 2)
x = np.hstack((x, y)) + n

# NOTE(review): this first constructor's arguments were lost in
# extraction; reconstructed as a plain 3-cluster model — confirm
# against the original article.
model = sc.AgglomerativeClustering(n_clusters=3)
pred_y1 = model.fit_predict(x)

# Same task, but merges are restricted to the k-NN connectivity
# graph, so clusters follow the spiral instead of cutting across it.
model = sc.AgglomerativeClustering(
    n_clusters=3,
    connectivity=nb.kneighbors_graph(
        x, 10, include_self=False))
pred_y2 = model.fit_predict(x)


def _show(title, labels):
    # One scatter figure per clustering result.
    mp.figure(title, facecolor='lightgray')
    mp.title(title, fontsize=20)
    mp.xlabel('x', fontsize=14)
    mp.ylabel('y', fontsize=14)
    mp.tick_params(labelsize=10)
    mp.grid(linestyle=':')
    mp.axis('equal')
    mp.scatter(x[:, 0], x[:, 1], c=labels, cmap='brg',
               alpha=0.5, s=60)


_show('Agglomerative Cluster 1', pred_y1)
_show('Agglomerative Cluster 2', pred_y2)
mp.show()

### 5.聚类的性能指标

a：该样本与同聚类的其它样本的平均距离，即内部距离。

b：该样本与离它所属聚类最接近的另一个聚类中各样本的平均距离，即外部距离。

轮廓系数 s = (b - a) / max(a, b)：

s -> -1：错误聚类

s -> 1：理想聚类

s -> 0：聚类重叠

sm.silhouette_score(输入, 输出, sample_size=样本数, metric='euclidean') -> 轮廓系数得分

### 6.噪声密度(DBSCAN)算法

# -*- coding: utf-8 -*-
# DBSCAN clustering demo: grid-search the neighborhood radius (eps)
# by silhouette score, then plot core, periphery and outlier samples
# in distinct styles.
from __future__ import unicode_literals
import numpy as np
import sklearn.cluster as sc
import sklearn.metrics as sm
import matplotlib.pyplot as mp

# Load samples: one comma-separated (x, y) pair per line.
x = []
with open('../../data/perf.txt', 'r') as f:
    # NOTE(review): the per-line loop was lost in extraction;
    # reconstructed as iterating over the file's lines — confirm.
    for line in f:
        data = [float(substr) for substr in line.split(',')]
        x.append(data)
x = np.array(x)

# Try 10 candidate radii and keep the model with the best silhouette.
epsilons, scores, models = \
    np.linspace(0.3, 1.2, 10), [], []
for epsilon in epsilons:
    # eps - neighborhood ("friend circle") radius
    # min_samples - minimum cluster size threshold
    model = sc.DBSCAN(eps=epsilon, min_samples=5)
    model.fit(x)
    score = sm.silhouette_score(
        x, model.labels_, sample_size=len(x),
        metric='euclidean')
    scores.append(score)
    models.append(model)
best_model = models[np.array(scores).argmax()]
pred_y = best_model.labels_

mp.figure('DBSCAN Cluster', facecolor='lightgray')
mp.title('DBSCAN Cluster', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
labels = set(pred_y)
# Sample len(labels) evenly spaced colors from the brg colormap:
#   0   1   2   3   4   5
# |---|---|---|---|---|---|
# B-----------R-----------G
cs = mp.get_cmap('brg', len(labels))(range(len(labels)))

# NOTE(review): the three scatter calls below were truncated in the
# source; reconstructed from the surviving keyword arguments —
# confirm against the original article.
core_mask = np.zeros(len(x), dtype=bool)
core_mask[best_model.core_sample_indices_] = True
offset_mask = pred_y == -1            # DBSCAN labels noise as -1
periphery_mask = ~(core_mask | offset_mask)
mp.scatter(x[core_mask][:, 0], x[core_mask][:, 1],
           c=cs[pred_y[core_mask]],
           s=60, label='Core')
mp.scatter(x[periphery_mask][:, 0], x[periphery_mask][:, 1],
           edgecolor=cs[pred_y[periphery_mask]],
           facecolor='none',
           s=60, label='Periphery')
mp.scatter(x[offset_mask][:, 0], x[offset_mask][:, 1],
           c=cs[pred_y[offset_mask]],
           marker='x',
           s=60, label='Offset')
mp.legend()
mp.show()

©️2019 CSDN 皮肤主题: 黑客帝国 设计师: 上身试试