一 kmeans算法进行图片聚类
思路: 每一个像素都是一个1x3的向量(RGB),对这些1x3的像素进行聚类,同一类的像素都用该类中心点的像素值替代。从结果来看,聚成60个类时与原图几乎没有差别
def restore_image(centers, labels, shape):
    """Rebuild an image in which every pixel is replaced by its cluster center.

    Args:
        centers: Array-like of cluster centers indexed by label; each row is
            the value written into the pixels that carry that label.
        labels: Flat sequence of integer cluster labels, one per pixel, in
            row-major (row by row) order.  Only the first ``row * col``
            entries are used.
        shape: ``(row, col, n)`` shape of the image to build.

    Returns:
        A float64 ``numpy.ndarray`` of shape ``shape`` whose pixel ``(i, j)``
        equals ``centers[labels[i * col + j]]``.

    Raises:
        IndexError: If ``labels`` holds fewer than ``row * col`` entries.
    """
    row, col, n = shape
    n_pixels = row * col

    palette = np.asarray(centers)
    if palette.ndim == 1:
        # Scalar centers: keep one value per row so it broadcasts over the
        # channel axis exactly like a per-pixel scalar assignment would.
        palette = palette[:, np.newaxis]

    pixel_labels = np.asarray(labels).ravel()[:n_pixels]
    if pixel_labels.size < n_pixels:
        raise IndexError(
            'expected at least {} labels, got {}'.format(n_pixels, pixel_labels.size)
        )

    image = np.empty((row, col, n))
    # A single fancy-indexing lookup replaces the per-pixel Python loop.
    image[...] = palette[pixel_labels].reshape((row, col) + palette.shape[1:])
    return image
def test_picture_cluster():
    """Quantize the colors of ``flower2.png`` with k-means and display the result.

    Every pixel is treated as one 3-component sample.  A ``KMeans`` model with
    ``num_vq`` clusters is fitted on 1000 randomly drawn pixels, every pixel
    of the image is then replaced by the center of the cluster it is assigned
    to, and the original and the quantized image are shown side by side.
    """
    mpl.rcParams['font.sans-serif'] = [u'SimHei']
    mpl.rcParams['axes.unicode_minus'] = False

    num_vq = 5  # number of clusters, i.e. distinct colors in the output

    # Author's note on candidate images (the number in parentheses is
    # presumably the cluster count tried for that file):
    # son.bmp(100)/flower2.png(200)/son.png(60)/lena.png(50)
    with Image.open('flower2.png') as im:
        # np.float was removed in NumPy 1.24; np.float64 is the same type.
        # Dividing by 255 assumes 8-bit channels -- TODO confirm for other files.
        image = np.array(im).astype(np.float64) / 255

    image = image[:, :, :3]  # keep only the first three channels
    image_v = image.reshape(-1, 3)  # flatten the picture to an N x 3 sample matrix
    N = image_v.shape[0]  # number of pixels

    clf = KMeans(n_clusters=num_vq, init='k-means++')
    idx = np.random.randint(0, N, 1000)  # train on 1000 randomly drawn pixels
    image_sample = image_v[idx]
    clf.fit(image_sample)

    # Label every pixel, then paint it with the center of its cluster.
    image_predict = clf.predict(image_v)
    image_show = restore_image(clf.cluster_centers_, image_predict, image.shape)

    plt.figure()
    plt.subplot(121)
    plt.imshow(image)
    plt.title('original')
    plt.subplot(122)
    plt.imshow(image_show)
    plt.title('cluster')
    plt.show()
    plt.close()
二 AP算法(Affinity Propagation,吸引力传播)
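preference: 每个样本点被选作聚类中心的参考度(偏好),取值越大,最终得到的聚类中心越多
affinity: 相似度的计算方法,'euclidean'表示负的欧氏距离平方;因此preference以距离平方的中位数作参考时,也要取其相反数(负的中位数)
阻尼系数(damping): 取值在[0.5, 1)之间,用于平滑每次迭代中吸引度和归属度的更新,避免数值震荡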
AP算法为选择合适的聚类中心需要不断的从数据点中搜集两方面的证据:候选聚类中心x(k)对任一数据点x(i)的吸引度信息r(i,k)和数据点x(i)选择候选聚类中心x(k)的归属度信息a(i,k)。
总的来说:AP算法是一种不断凝聚到中心的算法,可以不指定中心点的个数。缺点:计算复杂度大
# Affinity propagation on four Gaussian blobs, run once per preference value.
N = 400
centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]]
data, y = ds.make_blobs(N, n_features=2, centers=centers, cluster_std=[0.5, 0.25, 0.7, 0.5], random_state=0)
m = euclidean_distances(data, squared=True)  # pairwise squared Euclidean distances
# Median over the flattened distance matrix (no axis given), negated.
# The misspelt name is kept because later code in this file reads it.
perference = -np.median(m)

mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False

# A single figure: the original opened an additional, empty figure first,
# which was shown as a blank window next to the real one.
fig = plt.figure(figsize=(15, 9), facecolor='w')
for index, mul in enumerate(np.linspace(1, 4, 9)):
    # Scale the base preference by factors from 1 to 4, one subplot each.
    p = perference * mul
    clf = AffinityPropagation(affinity='euclidean', preference=p)
    y_predict = clf.fit_predict(data)
    plt.subplot(3, 3, index+1)
    plt.scatter(data[:, 0], data[:, 1], c=y_predict)
    plt.title('perference is {}'.format(p))
plt.show()
plt.close()
三 meanshift
均值漂移:个人理解为不断向着密度大的地方移动,直到漂移量不再发生明显变化
# Mean shift on the samples in `data`, run once for each of nine bandwidths.
fig2 = plt.figure(figsize=(15, 9))
for panel, mul in enumerate(np.linspace(0.1, 0.4, 9), start=1):
    # `perference` is defined earlier in this file as a negated median, so
    # negating it again yields a positive value to scale into a bandwidth.
    band_width = mul * -perference
    clf = MeanShift(bandwidth=band_width, bin_seeding=True)
    y_predict = clf.fit_predict(data)
    # Report how many cluster centers this bandwidth produced.
    print('中心点的个数', len(clf.cluster_centers_))
    plt.subplot(3, 3, panel)
    plt.scatter(data[:, 0], data[:, 1], c=y_predict)
    plt.title(f'Meanshift perference is {band_width}')
plt.show()
plt.close()
mean_shift
四 层次聚类
伪代码
假设有N个待聚类的样本,对于层次聚类来说,步骤:
1.(初始化)把每个样本归为一类,计算每两个类之间的距离,也就是样本与样本之间的相似度;
2.寻找各个类之间距离最近的两个类,将它们合并为一类(类总数减少一个);
3. 重新计算新生成的这个类与各个旧类之间的相似度;
4.重复2和3直到所有的样本点都归为一类,结束。
# Agglomerative (hierarchical) clustering on blob and two-moon data, each
# with and without added uniform noise.


def _append_uniform_noise(points, labels, noise_label, n_noise):
    """Return ``points`` and ``labels`` extended by ``n_noise`` noise samples.

    The noise is drawn with ``np.random.rand`` (global NumPy RNG) and scaled
    to the axis-aligned bounding box of the 2-D ``points``; every noise
    sample is labelled ``noise_label``.
    """
    low = np.min(points, axis=0)
    high = np.max(points, axis=0)
    noise = np.random.rand(n_noise, 2) * (high - low) + low
    noisy_points = np.concatenate((points, noise), axis=0)
    noisy_labels = np.concatenate((labels, [noise_label] * n_noise))
    return noisy_points, noisy_labels


warnings.filterwarnings(action='ignore', category=UserWarning)
np.set_printoptions(suppress=True)
np.random.seed(0)
n_clusters = 4  # not read in this section; each data set carries its own count below
N = 400
n_noise = int(0.1 * N)  # noise samples added to each data set

# Data set 1: four Gaussian blobs with increasing spread.
data1, y1 = ds.make_blobs(n_samples=N, n_features=2, centers=((-1, 1), (1, 1), (1, -1), (-1, -1)),
                          cluster_std=(0.1, 0.2, 0.3, 0.4), random_state=0)
data1_noise, y1_noise = _append_uniform_noise(data1, y1, 4, n_noise)

# Data set 2: two interleaving half moons.
data2, y2 = ds.make_moons(n_samples=N, noise=.05)
data2_noise, y2_noise = _append_uniform_noise(data2, y2, 3, n_noise)

linkages = ["ward", "complete", "average", "single"]
i = 1  # index into `linkages`; only this one linkage is used for every data set
fig = plt.figure(figsize=(15, 9))
for index, (n_cluster, data, y_label) in enumerate([(4, data1, y1), (4, data1_noise, y1_noise), (2, data2, y2),
                                                    (2, data2_noise, y2_noise)]):
    # Left column: samples colored by their generating label.
    plt.subplot(4, 2, 2*index+1)
    plt.scatter(data[:, 0], data[:, 1], c=y_label)
    # The visibility flag is passed positionally: its keyword was renamed
    # from `b` to `visible` in Matplotlib, so `b=True` fails on current versions.
    plt.grid(True, ls=':')

    # Restrict merges to a 7-nearest-neighbor graph, symmetrized by
    # averaging it with its transpose.
    connectivity = kneighbors_graph(data, n_neighbors=7, mode='distance', metric='minkowski', p=2, include_self=True)
    connectivity = 0.5*(connectivity + connectivity.T)
    # Euclidean distance is the estimator's default; the `affinity` keyword
    # is omitted because recent scikit-learn releases removed it.
    model = AgglomerativeClustering(n_clusters=n_cluster, connectivity=connectivity, linkage=linkages[i])
    y_predict = model.fit_predict(data)

    # Right column: samples colored by the predicted cluster.
    plt.subplot(4, 2, 2*index+2)
    plt.grid(True, ls=':')
    plt.scatter(data[:, 0], data[:, 1], c=y_predict)
plt.show()
plt.close()
五 谱聚类
谱聚类算法
# Spectral clustering of three concentric circles for several kernel widths.
matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
matplotlib.rcParams['axes.unicode_minus'] = False

# Three circles of radius 1, 2 and 3 sampled at the same angles.
t = np.arange(0, 2 * np.pi, 0.1)
data1 = np.vstack((np.cos(t), np.sin(t))).T
data2 = np.vstack((2 * np.cos(t), 2 * np.sin(t))).T
data3 = np.vstack((3 * np.cos(t), 3 * np.sin(t))).T
data = np.vstack((data1, data2, data3))

edu = euclidean_distances(data, squared=True)  # pairwise squared Euclidean distances
sigma = np.median(edu)  # not read below; the loop sweeps its own values
n_clusters = 3

# A single figure: the original put the super-title on a first figure that
# stayed empty and drew the subplots on a second, untitled one.
fig = plt.figure(figsize=(15, 9), facecolor='w')
plt.suptitle(u'谱聚类', fontsize=20)
clrs = plt.cm.Spectral(np.linspace(0, 0.8, n_clusters))  # one color per cluster (not used below)
for index, s in enumerate(np.logspace(-2, 0, 4)):  # sweep the kernel width to tune it
    # Gaussian-style affinity between samples.
    # NOTE(review): `edu` already holds squared distances, so `edu**2` is the
    # fourth power of the distance rather than the usual squared distance.
    # Left unchanged because the sweep range looks tuned to it -- confirm intent.
    af = np.exp(-edu**2 / (2*(s**2)))
    y_hat = spectral_clustering(af, n_clusters=n_clusters, assign_labels='kmeans', random_state=1)
    plt.subplot(4, 1, index + 1)
    plt.scatter(data[:, 0], data[:, 1], c=y_hat)
    plt.title('sigma is {}'.format(s))
plt.show()
plt.close()