K-means
1、找到所属簇
def find_close_centroids(X, centroids):
res = np.zeros((1,))
for x in X:
res = np.append(res, np.argmin(np.sqrt(np.sum((centroids-x)**2, axis=1))))
return res[1:]
2、计算新的簇中心
def compute_centroids(X, idx):
K = int(np.max(idx))+1
m = X.shape[0]
n = X.shape[-1]
centroids = np.zeros((K, n))
counts = np.zeros((K, n))
for i in range(m):
centroids[int(idx[i])] += X[i]
counts[int(idx[i])] += 1
centroids = centroids / counts
return centroids
3、测试一下
data = sio.loadmat(文件路径)
X = data["X"] # (300,2)
init_centroids = np.array([[3, 3], [6, 2], [8, 5]])
idx = find_close_centroids(X, init_centroids) # (300,)
print(compute_centroids(X, idx))
4、随机选取K组数据为簇中心
def random_initialization(X, K):
res = np.zeros((1, X.shape[-1]))
m = X.shape[0]
rl = []
while True:
index = random.randint(0, m)
if index not in rl:
rl.append(index)
if len(rl) >= K:
break
for index in rl:
res = np.concatenate((res, X[index].reshape(1, -1)), axis=0)
return res[1:]
5、计算代价函数
def cost(X, idx, centroids):
c = 0
for i in range(len(X)):
c += np.sum((X[i] - centroids[int(idx[i])]) ** 2)
c /= len(X)
return c
6、k-means聚类算法
def k_means(X, K):
centroids = random_initialization(X, K)
centroids_all = [centroids]
idx = np.zeros((1,))
last_c = -1
now_c = -2
# 收敛时结束算法
while now_c != last_c:
idx = find_close_centroids(X, centroids)
last_c = now_c
now_c = cost(X, idx, centroids)
centroids = compute_centroids(X, idx)
centroids_all.append(centroids)
return idx, centroids_all
7、可视化聚类结果、簇中心移动过程
def visualizing(X, idx, centroids_all):
plt.scatter(X[..., 0], X[..., 1], c=idx)
xx = []
yy = []
for c in centroids_all:
xx.append(c[..., 0])
yy.append(c[..., 1])
plt.plot(xx, yy, 'rx--')
plt.show()
8、尝试一下
idx, centroids_all = k_means(X, 3)
visualizing(X, idx, centroids_all)
9、压缩图片
def compress(image, colors_num):
d1, d2, _ = image.shape
raw_image = image.reshape(d1*d2, -1)
idx, centroids_all = k_means(raw_image, colors_num)
colors = centroids_all[-1]
compress_image = np.zeros((1, 1))
for i in range(d1*d2):
compress_image = np.concatenate((compress_image, idx[i].reshape(1, -1)), axis=0)
compressed_image = compress_image[1:].reshape(d1, d2, -1)
return compressed_image, colors
10、将压缩后图像转换为可以正常显示的图片格式
def compressed_format_to_normal_format(compressed_image, colors):
d1, d2, _ = compressed_image.shape
normal_format_image = np.zeros((1, len(colors[0])))
compressed_image = compressed_image.reshape(d1*d2, -1)
for i in range(d1*d2):
normal_format_image = np.concatenate((normal_format_image, colors[int(compressed_image[i][0])].reshape(1, -1)), axis=0)
normal_format_image = normal_format_image[1:].reshape(d1, d2, -1)
return normal_format_image
11、测试一下
image = plt.imread(文件路径)
print(image.shape) # (128,128,3)
plt.subplot(1, 2, 1)
plt.imshow(image)
plt.axis("off")
plt.title("raw image")
plt.subplot(1, 2, 2)
compressed_image, colors = compress(image, 16)
print(compressed_image.shape, colors.shape) # (128, 128, 1) (16, 3)
plt.imshow(compressed_format_to_normal_format(compressed_image, colors))
plt.axis("off")
plt.title("compressed image")
plt.show()
降维
1、数据归一化
def data_process(X):
mean = np.mean(X, axis=0)
std = np.std(X, axis=0, ddof=1)
return (X-mean)/std, mean, std
2、PCA
def pca(X):
sigma = X.T.dot(X)/len(X) # (n,m)*(m,n) (n,n)
u, s, v = np.linalg.svd(sigma) # u(n,n) s(n, ) v(n,n)
return u, s, v
3、降维 X(m,n) U(n,n) X原始数据 U奇异值分解后的U K目标维度
def project_data(X, U, K):
return X.dot(U[..., :K])
4、数据升维 Z降维后数据 U奇异值分解后 K降维的维度
def reconstruct_data(Z, U, K):
return Z.dot(U[..., :K].T)
5、测试
data = sio.loadmat(文件路径)
X = data["X"] # (50,2)
normalized_X, _, _ = data_process(X)
u, _, _ = pca(normalized_X) # (2,2)
Z = project_data(normalized_X, u, 1)
print(Z.shape) # (50,1)
print(Z[0]) # [1.48127391]
rec_X = reconstruct_data(Z, u, 1)
print(rec_X.shape) # (50,2)
print(rec_X[0]) # [-1.04741883 -1.04741883]
plt.scatter(normalized_X[..., 0], normalized_X[..., 1], marker="x", c="b", label="normalize x")
plt.scatter(rec_X[..., 0], rec_X[..., 1], marker="x", c="r", label="reconstructed x")
plt.title("Visualizing the projections")
for i in range(len(normalized_X)):
plt.plot([normalized_X[i][0], rec_X[i][0]], [normalized_X[i][1], rec_X[i][1]], 'k--')
plt.legend()
plt.show()
6、脸部图像处理
# 可视化图片 d 一行展示的图片张数
def visualizing_images(X, d):
m = X.shape[0]
n = X.shape[-1]
s = int(np.sqrt(n))
for i in range(1, m+1):
plt.subplot(int(m/d), d, i)
plt.axis("off")
plt.imshow(X[i-1].reshape(s, s).T, cmap="Greys_r")
plt.show()
原始图
data = sio.loadmat(学习路径)
X = data["X"] # (5000,1024)
visualizing_images(X[:25], 5)
处理后的图
nor_X, _, _ = data_process(X)
u, _, _ = pca(nor_X)
Z = project_data(nor_X, u, 36)
rec_X = reconstruct_data(Z, u, 36)
visualizing_images(rec_X[:25], 5)