1.密度聚类
# 密度聚类
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as ds
import matplotlib.colors
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
def expand(a, b, ratio=0.1):
    """Widen the interval ``[a, b]`` by a margin on both sides.

    The margin is ``ratio`` times the interval length, so the default
    adds 10% of the span below ``a`` and above ``b``.

    Args:
        a: Lower bound of the interval.
        b: Upper bound of the interval.
        ratio: Fraction of ``b - a`` added on each side (default 0.1).

    Returns:
        A ``(low, high)`` tuple with the widened bounds.
    """
    margin = (b - a) * ratio
    return a - margin, b + margin
if __name__ == "__main__":
    # Demo: run DBSCAN on a synthetic 2-D blob data set for several
    # (eps, min_samples) settings and draw each result in its own subplot.
    N = 1000
    centers = [[1, 2], [-1, -1], [1, -1], [-1, 1]]
    # ds.make_blobs generates the synthetic data set:
    #   first argument - number of samples
    #   n_features     - number of features per sample
    #   centers        - given here as explicit coordinates, one per cluster
    #   cluster_std    - standard deviation of each cluster
    #   random_state   - random seed, fixed so the data is reproducible
    data, y = ds.make_blobs(N, n_features=2, centers=centers,
                            cluster_std=[0.5, 0.25, 0.7, 0.5], random_state=0)
    print(data)
    # Pre-processing: standardise the features.
    data = StandardScaler().fit_transform(data)
    # Parameter grid: (epsilon, min_samples)
    params = ((0.2, 5), (0.2, 10), (0.2, 15), (0.3, 5), (0.3, 10), (0.3, 15))
    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(12, 8), facecolor='w')
    plt.suptitle(u'DBSCAN聚类', fontsize=20)

    # The data never changes inside the loop, so the padded axis limits
    # are computed once and reused for every subplot.
    x1_min, x2_min = np.min(data, axis=0)
    x1_max, x2_max = np.max(data, axis=0)
    x1_min, x1_max = expand(x1_min, x1_max)
    x2_min, x2_max = expand(x2_min, x2_max)

    for plot_no, (eps, min_samples) in enumerate(params, start=1):
        model = DBSCAN(eps=eps, min_samples=min_samples)
        model.fit(data)
        # Cluster label assigned to every sample.
        y_hat = model.labels_
        # Boolean mask that is True for the core samples.
        core_indices = np.zeros_like(y_hat, dtype=bool)
        core_indices[model.core_sample_indices_] = True
        # np.unique drops duplicates and returns the labels sorted in
        # ascending order.
        y_unique = np.unique(y_hat)
        # The label -1 is excluded from the cluster count.
        n_clusters = y_unique.size - (1 if -1 in y_hat else 0)
        print(y_unique, '聚类簇的个数为:', n_clusters)

        plt.subplot(2, 3, plot_no)
        # One colour per distinct label, taken from the Spectral colormap.
        clrs = plt.cm.Spectral(np.linspace(0, 0.8, y_unique.size))
        print(clrs)
        for k, clr in zip(y_unique, clrs):
            cur = (y_hat == k)
            if k == -1:
                # Samples labelled -1 are drawn as small black dots.
                plt.scatter(data[cur, 0], data[cur, 1], s=20, c='k')
                continue
            # `color=` (not `c=`) is used for a single RGBA value so that
            # matplotlib cannot mistake it for per-point scalar data.
            plt.scatter(data[cur, 0], data[cur, 1], s=30, color=clr,
                        edgecolors='k')
            # Core samples of this cluster are drawn again with a larger
            # marker.
            core = data[cur & core_indices]
            plt.scatter(core[:, 0], core[:, 1], s=60, color=clr, marker='o',
                        edgecolors='k')
        plt.xlim((x1_min, x1_max))
        plt.ylim((x2_min, x2_max))
        plt.grid(True)
        # Raw string: '\e' is not a valid escape sequence in a normal literal.
        plt.title(r'$\epsilon$ = %.1f m = %d,聚类数目:%d'
                  % (eps, min_samples, n_clusters), fontsize=16)
    plt.tight_layout()
    plt.subplots_adjust(top=0.9)
    plt.show()
[-1 0 1 2 3] 聚类簇的个数为: 4
[[0.61960784 0.00392157 0.25882353 1. ]
[0.95686275 0.42745098 0.2627451 1. ]
[0.99607843 0.87843137 0.54509804 1. ]
[0.90196078 0.96078431 0.59607843 1. ]
[0.4 0.76078431 0.64705882 1. ]]
[-1 0 1 2 3 4] 聚类簇的个数为: 5
[[0.61960784 0.00392157 0.25882353 1. ]
[0.90442138 0.3479431 0.28304498 1. ]
[0.9928489 0.71695502 0.4094579 1. ]
[0.99915417 0.97377932 0.70503652 1. ]
[0.81122645 0.92387543 0.61453287 1. ]
[0.4 0.76078431 0.64705882 1. ]]
[-1 0] 聚类簇的个数为: 1
[[0.61960784 0.00392157 0.25882353 1. ]
[0.4 0.76078431 0.64705882 1. ]]
[-1 0 1] 聚类簇的个数为: 2
[[0.61960784 0.00392157 0.25882353 1. ]
[0.99607843 0.87843137 0.54509804 1. ]
[0.4 0.76078431 0.64705882 1. ]]
[-1 0 1 2 3] 聚类簇的个数为: 4
[[0.61960784 0.00392157 0.25882353 1. ]
[0.95686275 0.42745098 0.2627451 1. ]
[0.99607843 0.87843137 0.54509804 1. ]
[0.90196078 0.96078431 0.59607843 1. ]
[0.4 0.76078431 0.64705882 1. ]]
2.K-Means图像矢量量化
from PIL import Image
import numpy as np
from sklearn.cluster import KMeans
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
def restore_image(cb, cluster, shape):
    """Rebuild an image by replacing every pixel label with its codebook colour.

    Args:
        cb: Codebook; ``cb[k]`` is the colour used for label ``k``.
        cluster: Flat sequence of per-pixel labels in row-major order. It
            must hold at least ``row * col`` entries; extras are ignored.
        shape: ``(row, col, channels)`` of the target image. Only ``row``
            and ``col`` are used.

    Returns:
        A float ``numpy`` array of shape ``(row, col, 3)``.
    """
    row, col, _ = shape
    labels = np.asarray(cluster)[:row * col]
    image = np.empty((row, col, 3))
    # A single fancy-indexing lookup replaces the per-pixel Python loop.
    # Assigning into the preallocated array keeps the float64 dtype.
    image[...] = np.asarray(cb)[labels].reshape(row, col, -1)
    return image
def show_scatter(a):
    """Plot how the colours in ``a`` are distributed over a coarse RGB grid.

    The samples are binned into a 10 x 10 x 10 histogram over the range
    [0, 1] on each axis, and the counts are scaled by the largest bin.
    Figure 1 is a 3-D scatter whose marker size follows the scaled count.
    Figure 2 plots the non-empty bins sorted from most to least populated.

    Args:
        a: Colour samples; expected to be an ``(n, 3)`` array of RGB values,
            since the histogram is built over three axes.
    """
    bins = 10
    print('原始数据:\n', a)
    unit = (0, 1)
    density, _ = np.histogramdd(a, bins=[bins] * 3, range=[unit] * 3)
    density /= density.max()

    # 'ij' indexing makes each grid line up with the matching axis of
    # `density` (first axis -> red, second -> green, third -> blue).
    ticks = np.arange(bins)
    red, green, blue = np.meshgrid(ticks, ticks, ticks, indexing='ij')

    fig = plt.figure(1, facecolor='w')
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(red, green, blue, c='r', s=100*density, marker='o', depthshade=True)
    ax.set_xlabel(u'红色分量')
    ax.set_ylabel(u'绿色分量')
    ax.set_zlabel(u'蓝色分量')
    plt.title(u'图像颜色三维频数分布', fontsize=20)

    plt.figure(2, facecolor='w')
    # Non-empty bins only, in descending order of scaled count.
    occupied = np.sort(density[density > 0])[::-1]
    rank = np.arange(len(occupied))
    plt.plot(rank, occupied, 'r-', rank, occupied, 'go', lw=2)
    plt.title(u'图像颜色频数分布', fontsize=18)
    plt.grid(True)
    plt.show()
if __name__ == '__main__':
    # Demo: vector-quantise an image by clustering its colours with K-Means
    # and repainting every pixel with the centre of its cluster.
    matplotlib.rcParams['font.sans-serif'] = [u'SimHei']
    matplotlib.rcParams['axes.unicode_minus'] = False
    num_vq = 60  # number of colours (clusters) in the quantised image
    # Alternatives noted by the author:
    # son.bmp(100)/flower2.png(200)/son.png(60)/lena.png(50)
    # The context manager closes the file once the pixels have been copied.
    with Image.open('flower2.png') as im:
        # Scale to [0, 1] (assumes 8-bit channels). `np.float` was removed
        # from NumPy, so the explicit float64 type is used instead.
        image = np.array(im).astype(np.float64) / 255
    # Keep only the first three channels.
    image = image[:, :, :3]
    image_v = image.reshape((-1, 3))
    show_scatter(image_v)
    N = image_v.shape[0]  # total number of pixels
    # Fit the cluster centres on a random sample of 1000 pixels
    # (drawn with replacement) rather than on every pixel.
    idx = np.random.randint(0, N, size=1000)
    image_sample = image_v[idx]
    model = KMeans(n_clusters=num_vq)
    model.fit(image_sample)
    c = model.predict(image_v)  # cluster label of every pixel
    print('聚类结果:\n', c)
    print('聚类中心:\n', model.cluster_centers_)
    plt.figure(figsize=(15, 8), facecolor='w')
    plt.subplot(121)
    plt.axis('off')
    plt.title(u'原始图片', fontsize=18)
    plt.imshow(image)
    # plt.savefig('1.png')
    plt.subplot(122)
    vq_image = restore_image(model.cluster_centers_, c, image.shape)
    plt.axis('off')
    plt.title(u'矢量量化后图片:%d色' % num_vq, fontsize=20)
    plt.imshow(vq_image)
    # plt.savefig('2.png')
    # `pad` must be passed by keyword in current Matplotlib.
    plt.tight_layout(pad=1.2)
    plt.show()
原始数据:
[[0.29411765 0.50588235 0.12941176]
[0.30196078 0.51372549 0.1372549 ]
[0.30588235 0.51764706 0.14117647]
...
[0.09803922 0.35686275 0.01176471]
[0.08627451 0.34117647 0.01176471]
[0.07058824 0.3254902 0. ]]
聚类结果:
[23 23 23 ... 10 57 57]
聚类中心:
[[0.14771242 0.01699346 0.44509804]
[0.45139319 0.65634675 0.30443756]
[0.22207698 0.44560639 0.09600581]
[0.71279178 0.55406162 0.99010271]
[0.3874183 0.24771242 0.76176471]
[0.74154995 0.66610644 0.50289449]
[0.02337255 0.21396078 0.00298039]
[0.87098039 0.6927451 0.99647059]
[0.38879552 0.61045752 0.24014939]
[0.2611578 0.51951447 0.00737628]
[0.11184314 0.3732549 0.00376471]
[0.90653595 0.83594771 0.66601307]
[0.439819 0.62594268 0.05339367]
[0.70588235 0.84607843 0.46176471]
[0.5054902 0.36862745 0.85411765]
[0.52254902 0.50784314 0.32892157]
[0.28872549 0.15392157 0.60294118]
[0.01535948 0.07320261 0.0120915 ]
[0.46117647 0.35058824 0.43058824]
[0.3627451 0.61336898 0.00641711]
[0.78431373 0.62913165 0.98804855]
[0.62254902 0.78431373 0.10980392]
[0.55098039 0.56862745 0.66470588]
[0.27598039 0.50906863 0.12598039]
[0.82712418 0.74183007 0.57418301]
[0.96176471 0.92843137 0.98921569]
[0.05709343 0.28512111 0.00253749]
[0.34215686 0.27418301 0.62254902]
[0.61568627 0.77039216 0.41705882]
[0.44989107 0.32440087 0.78453159]
[0.15869281 0.42474946 0.00522876]
[0.56339869 0.41525054 0.92374728]
[0.74901961 0.77703081 0.92156863]
[0.77921569 0.89019608 0.52941176]
[0.30292117 0.55910364 0.00792317]
[0.15163399 0.1379085 0.02156863]
[0.15042017 0.28459384 0.03361345]
[0.56862745 0.63372549 0.24078431]
[0.42009804 0.27696078 0.87303922]
[0.23398693 0.09782135 0.53834423]
[0.22352941 0.35098039 0.23333333]
[0.91437908 0.78366013 0.99444444]
[0.47058824 0.47607843 0.2054902 ]
[0.32941176 0.1795207 0.71089325]
[0.67736185 0.59215686 0.43707665]
[0.17670127 0.37600923 0.03171857]
[0.55238095 0.70644258 0.33697479]
[0.5 0.65980392 0.17009804]
[0.07712418 0.01568627 0.18562092]
[0.31421569 0.35833333 0.08333333]
[0.52091503 0.69803922 0.02875817]
[0.58954248 0.53071895 0.81960784]
[0.62229102 0.47389061 0.95190918]
[0.29934641 0.18039216 0.36993464]
[0.21597322 0.46791009 0.00966045]
[0.59383754 0.5719888 0.43137255]
[0.33411765 0.53427451 0.19701961]
[0.0951634 0.33071895 0.00392157]
[0.38966132 0.5885918 0.14153298]
[0.48954248 0.41045752 0.62287582]]