一、两种数据压缩算法
1.Kmeans聚类
算法思路:
步骤:
- 确定聚类的种类K。
- 随机确定K个聚类中心。
- 遍历数据集,计算每一个数据到K个聚类中心的距离,选择距离最小的聚类中心,打上对应的标签。
- 对同一标签的数据计算中心坐标,成为新的K聚类中心。
- 重复第3、4步,直到聚类中心不再改变。
2.PCA
算法思路:
步骤:
- 对数据集标准化处理。
- 求出协方差矩阵 Σ。 Σ = X.T * X / m(m 为样本个数)
- 求出 u,s。 u,s, v = svd(Σ)
- 确定最终所需要的维度K。降维数据Z = 原始数据X * u的前K列向量。
- 数据恢复。 恢复数据X' = 降维数据Z * u的前K列向量的转置。
二、python实现
1.Kmeans聚类实现
例子 1:聚类散点
# Load the example dataset from the .mat file and take its 'X' entry.
# NOTE(review): `scio` is presumably `scipy.io`; its import is not shown here — confirm.
path = r'ex7data2.mat'
data = scio.loadmat(path)
X = data['X']
# Choose fixed starting centroids: three 2-D points, i.e. K = 3.
initial_centroid = np.array([[3, 3], [6, 2], [8, 5]])
# Label every sample with its closest starting centroid.
index = find_close_centroid(X, initial_centroid)
# Recompute each centroid as the mean of the samples that carry its label.
centroid = compute_centroids(X, index, initial_centroid)
# Optional: visualise 10 K-means iterations (left disabled).
# plot_progress_kmeans(X, index, initial_centroid, 10)
# Replace the fixed centroids with 3 randomly chosen rows of X.
initial_centroid = kmeans_init_centroid(X, 3)
给每个数据加标签的实现函数:
import numpy as np
def find_close_centroid(x, centroid):
    """Label every sample with the 1-based index of its nearest centroid.

    Distances are squared Euclidean distances. Ties go to the centroid with
    the lowest index. A centroid whose distance is NaN (for example a NaN
    centroid row) is never selected unless every distance in the row is NaN.

    Args:
        x: (m, n) array with one sample per row.
        centroid: (k, n) array with one centroid per row.

    Returns:
        (m, 1) float array whose entries are labels in 1..k.

    Raises:
        ValueError: if ``centroid`` has no rows.
    """
    # Work in float64: with unsigned inputs (e.g. uint8 pixels) the
    # subtraction and the square would wrap around and pick wrong centroids.
    points = np.asarray(x, dtype=np.float64)
    centers = np.asarray(centroid, dtype=np.float64)
    m = points.shape[0]
    k = centers.shape[0]
    if k == 0:
        raise ValueError('centroid must contain at least one row')

    # One column of squared distances per centroid keeps memory at O(m * k).
    sq_dist = np.empty((m, k))
    for j in range(k):
        diff = points - centers[j, :]
        sq_dist[:, j] = np.sum(diff * diff, axis=1)

    # Treat NaN as "infinitely far" so argmin never prefers it.
    sq_dist = np.where(np.isnan(sq_dist), np.inf, sq_dist)

    # argmin has no magic upper bound, so arbitrarily distant samples are
    # still labelled; it returns the first minimum, matching a strict "<".
    close_index = np.zeros([m, 1])
    close_index[:, 0] = np.argmin(sq_dist, axis=1) + 1
    return close_index
根据分好类的数据重新计算中心的函数:
def compute_centroids(x, index, cetroid):
    """Recompute each centroid as the mean of the samples assigned to it.

    Args:
        x: (m, n) array with one sample per row.
        index: per-sample labels in 1..k, shaped (m, 1) or (m,).
        cetroid: (k, n) array of the current centroids. It fixes the shape of
            the result and supplies the fallback for clusters with no members.

    Returns:
        (k, n) float array of updated centroids.
    """
    k = cetroid.shape[0]
    labels = np.asarray(index).reshape(-1)
    mean_index = np.zeros([cetroid.shape[0], cetroid.shape[1]])
    for i in range(k):
        members = np.where(labels == i + 1)[0]
        if members.size == 0:
            # The mean of an empty slice is NaN; keep the previous centroid
            # so one empty cluster does not poison later iterations.
            mean_index[i, :] = cetroid[i, :]
            continue
        mean_index[i, :] = np.mean(x[members, :], axis=0)
    return mean_index
随机初始化中心坐标:
import numpy as np
def kmeans_init_centroid(x, k):
    """Pick k distinct rows of x at random to serve as initial centroids.

    Args:
        x: (m, n) array with one sample per row.
        k: number of centroids to draw; must satisfy 1 <= k <= m.

    Returns:
        (k, n) array holding k different rows of ``x``.

    Raises:
        ValueError: if ``k`` is outside 1..m. Slicing would otherwise silently
            return the wrong number of centroids.
    """
    m = x.shape[0]
    if k < 1 or k > m:
        raise ValueError('k must be between 1 and %d, got %d' % (m, k))
    # A random permutation of the row numbers guarantees distinct picks.
    randindex = np.random.permutation(m)
    initial_centroid = x[randindex[0:k], :]
    return initial_centroid
迭代过程可视化函数实现:
import numpy as np
import matplotlib.pyplot as plt
from computeCentroids import compute_centroids
from findCloseCentroid import find_close_centroid
def plot_progress_kmeans(x, index, centroid, maxinter):
    """Show one scatter plot per K-means iteration, coloured by cluster.

    Only the first two columns of ``x`` are plotted. After each figure the
    centroids are recomputed and the samples are relabelled.

    Args:
        x: (m, n) array of samples with n >= 2.
        index: per-sample labels in 1..K, shaped (m, 1).
        centroid: (K, n) array of current centroids; K sets how many classes
            are drawn.
        maxinter: number of iterations (and figures) to produce.
    """
    # The first three colours match the original r/g/y scheme; later ones
    # let K exceed 3, cycling if K is larger than the palette.
    palette = ['r', 'g', 'y', 'b', 'c', 'm', 'k']
    for j in range(maxinter):
        for label in range(1, centroid.shape[0] + 1):
            members = np.where(index == label)[0]
            plt.scatter(x[members, 0], x[members, 1],
                        c=palette[(label - 1) % len(palette)], marker='o')
        # Report the real iteration budget instead of a hard-coded 10.
        plt.title('Total interation: %d Now Interation : %d' % (maxinter, j + 1))
        plt.show()
        # Recompute every class centre, then reassign the samples.
        centroid = compute_centroids(x, index, centroid)
        index = find_close_centroid(x, centroid)
聚类过程:
第一次迭代:
第二次迭代:
第五次迭代:
…第十次迭代:
例子2:聚类图像的颜色
# Load the image data from the .mat file and take its 'A' entry.
# NOTE(review): `scio` is presumably `scipy.io`; its import is not shown here — confirm.
data = scio.loadmat(r'bird_small.mat')
A = data['A']
# Flatten the 3-D array of shape (m, n, h) into an (m*n, h) matrix.
m, n, h = A.shape
A = A.reshape(m*n, h)
# Cluster the rows into 16 groups using 10 K-means iterations, then hand the
# result to compress_picture for display.
initial_centroid = kmeans_init_centroid(A, 16)
index = find_close_centroid(A, initial_centroid)
res_centroid, res_index = get_result_kmeans(A, index, initial_centroid, 10)
A_compress = compress_picture(A, res_index, res_centroid, m, n, h)
得到最后聚类结果的函数实现:
from computeCentroids import compute_centroids
from findCloseCentroid import find_close_centroid
def get_result_kmeans(x, index, centroid, maxinter):
    """Run K-means updates until assignments stop changing or maxinter is hit.

    Each iteration recomputes the centroids from the current labels,
    relabels every sample, and prints one progress line.

    Args:
        x: (m, n) array with one sample per row.
        index: initial per-sample labels, as produced by find_close_centroid.
        centroid: (k, n) array of initial centroids.
        maxinter: upper bound on the number of iterations.

    Returns:
        Tuple ``(centroid, index)`` with the final centroids and labels.
    """
    # Imported locally because this snippet's own import lines do not bring
    # numpy into scope.
    import numpy as np

    for _ in range(maxinter):
        # Recompute every class centre, then reassign the samples.
        centroid = compute_centroids(x, index, centroid)
        new_index = find_close_centroid(x, centroid)
        print('Training.........')
        # Unchanged labels mean the next update would reproduce the same
        # centroids and labels, so further iterations are redundant.
        converged = np.array_equal(new_index, index)
        index = new_index
        if converged:
            break
    return centroid, index
显示压缩图像实现:
import numpy as np
import matplotlib.pyplot as plt
def compress_picture(x, index, centroid, m, n, h):
    """Display an image and its K-means colour-quantised version.

    Every pixel row is replaced by the (truncated) centroid of its cluster.
    The work is done on a copy, so the caller's ``x`` is left untouched.

    Args:
        x: (m*n, h) array with one pixel per row.
        index: per-pixel labels in 1..k, shaped (m*n, 1) or (m*n,).
        centroid: (k, h) array of cluster centres.
        m: image height used for reshaping.
        n: image width used for reshaping.
        h: number of channels used for reshaping.

    Returns:
        (m, n, h) array holding the compressed image, with the dtype of ``x``.
    """
    # Centroids are truncated to whole numbers before being written back.
    centroid = np.trunc(centroid).astype(int)
    k = centroid.shape[0]

    # Show the original image.
    plt.figure(1)
    plt.imshow(x.reshape(m, n, h))
    plt.axis('off')
    plt.title('Oringinal Picture')
    plt.show()

    # Quantise a copy so the input pixels survive the call.
    compressed = np.array(x, copy=True)
    labels = np.asarray(index).reshape(-1)
    for i in range(k):
        compressed[labels == i + 1, :] = centroid[i, :]
    compressed = compressed.reshape(m, n, h)

    # Show the compressed image.
    plt.figure(2)
    plt.imshow(compressed)
    plt.axis('off')
    plt.title('Compress Picture')
    plt.show()

    # Callers assign the result, so hand the compressed image back.
    return compressed
结果:
原始图像:
压缩图像:
2.PCA 实现
例子1:将二维数据降维成一维数据
# Load the example dataset from the .mat file and take its 'X' entry.
# NOTE(review): `scio` is presumably `scipy.io`; its import is not shown here — confirm.
data = scio.loadmat(r'ex7data1.mat')
X = data['X']
# Optional scatter plot of the raw data (left disabled).
# plt.scatter(X[:, 0], X[:, 1], c='b', marker='o')
# plt.show()
# Standardise the features, then run PCA on the standardised data.
x_norm = feature_normalize(X)
u, s = pca(x_norm)
# Print the first two entries of the first column of u.
print(u[0, 0], u[1, 0])
# Project onto the first K = 1 column of u.
K = 1
Z = project_data(x_norm, u, K)
print(Z[0, 0])
# Map the projection back into the original feature space.
x_recov = recovery_data(Z, u, K)
print(x_recov[0, :])
# Overlay the standardised data (red) and its reconstruction (blue).
plt.scatter(x_norm[:, 0], x_norm[:, 1], c='r', marker='o')
plt.scatter(x_recov[:, 0], x_recov[:, 1], c='b', marker='o')
plt.show()
标准化函数实现:
import numpy as np
def feature_normalize(x):
    """Standardise every column of x to zero mean and unit variance.

    The population standard deviation (``np.std`` default, ddof=0) is used.
    Columns with zero standard deviation come out as all zeros rather than
    inf/NaN.

    Args:
        x: (m, n) array with one sample per row.

    Returns:
        (m, n) float array of standardised features.
    """
    x = np.asarray(x, dtype=np.float64)
    n = x.shape[1]
    mu = np.mean(x, axis=0)
    sigma = np.std(x, axis=0).reshape(1, n)
    # A constant column has sigma == 0; dividing by 1 instead leaves its
    # centred values (all zero) untouched and avoids a division by zero.
    sigma = np.where(sigma == 0, 1.0, sigma)
    return (x - mu) / sigma
pca函数实现:
import numpy as np
from numpy import linalg
def pca(x):
    """Decompose the matrix ``x.T @ x / m`` with an SVD.

    Args:
        x: (m, n) array with one sample per row.

    Returns:
        Tuple ``(u, s)``: the left singular vectors (one per column) and the
        singular values returned by ``linalg.svd`` for that n-by-n matrix.
    """
    sample_count = x.shape[0]
    scatter = np.dot(x.T, x) / sample_count
    # The third SVD factor is not needed by any caller.
    left_vectors, singular_values, _ = linalg.svd(scatter)
    return left_vectors, singular_values
投影(降维)函数实现:
import numpy as np
def project_data(x, u, k):
    """Project x onto the first k columns of u.

    Args:
        x: (m, n) array with one sample per row.
        u: (n, n) array whose columns form the basis.
        k: number of leading columns of ``u`` to keep.

    Returns:
        (m, k) array, the product of ``x`` and ``u[:, 0:k]``.
    """
    leading_basis = u[:, :k]
    projected = np.dot(x, leading_basis)
    return projected
恢复数据函数实现:
import numpy as np
def recovery_data(z, u, k):
    """Map projected data back using the first k columns of u.

    Args:
        z: (m, k) array of projected samples.
        u: (n, n) array whose columns form the basis.
        k: number of leading columns of ``u`` that produced ``z``.

    Returns:
        (m, n) array, the product of ``z`` and the transpose of ``u[:, 0:k]``.
    """
    leading_basis_t = u[:, :k].T
    restored = np.dot(z, leading_basis_t)
    return restored
运行结果:
例子2:人像特征降维
# Load the face dataset and display the first 100 rows as images.
# NOTE(review): `scio` is presumably `scipy.io`; its import is not shown here — confirm.
data = scio.loadmat(r'ex7faces.mat')
X = data['X']
displaydata(X[0:100, :])
# Normalise the data, run PCA and display the first 36 columns of u.
x_norm = feature_normalize(X)
# PCA has to be fitted on the same normalised matrix that is projected
# below; fitting on raw X yields a basis that does not belong to x_norm.
u, s = pca(x_norm)
displaydata(u[:, 0:36].T)
# Compress to K = 100 dimensions, then reconstruct and display.
K = 100
Z = project_data(x_norm, u, K)
# The reconstruction lives in the normalised feature space of x_norm.
recv_X = recovery_data(Z, u, K)
displaydata(recv_X[0:100, :])
显示人像函数的实现:
import matplotlib.pyplot as plt
import numpy as np
def _build_display_grid(x, pad=1):
    """Tile the rows of x into one 2-D matrix with ``pad`` cells of spacing."""
    m, n = x.shape
    if m == 0 or n == 0:
        raise ValueError('x must contain at least one sample and one feature')

    # Size of a single example.
    height = int(np.sqrt(n))
    width = n // height
    # Grid layout; rounding the columns up keeps every sample on the canvas.
    rows = int(np.sqrt(m))
    cols = int(np.ceil(m / rows))

    # Vertical extent is driven by tile height, horizontal by tile width.
    grid = np.zeros((pad + rows * (pad + height), pad + cols * (pad + width)))
    for sample in range(m):
        # Row-major placement: sample number = i * cols + j.
        i, j = divmod(sample, cols)
        top = pad + i * (pad + height)
        left = pad + j * (pad + width)
        # Reshape to (width, height) and transpose, i.e. the flat row is read
        # column by column into a (height, width) tile.
        grid[top:top + height, left:left + width] = \
            x[sample, :].reshape(width, height).T
    return grid


def displaydata(x):
    """Show the rows of x as a grid of grayscale image tiles.

    Each row is reshaped into a tile of ``int(sqrt(n))`` by
    ``n // int(sqrt(n))`` cells, and the tiles are laid out row by row on a
    zero-filled canvas with one cell of padding between them.

    Args:
        x: (m, n) array with one flattened image per row.

    Raises:
        ValueError: if ``x`` is empty or ``n`` cannot be reshaped into a tile.
    """
    dis_mat = _build_display_grid(x, pad=1)
    plt.figure()
    plt.imshow(dis_mat, cmap='gray', extent=[-1, 1, -1, 1])
    plt.axis('off')
    plt.show()
结果:
人像特征:
pca压缩后再恢复的人像:
例子3 Kmeans聚类+PCA可视化图像数据降维
# Load the image, convert it to floats and flatten it to one row per pixel.
# NOTE(review): `io` and `img_as_float` are presumably from scikit-image; their imports are not shown here — confirm.
image = io.imread(r'bird_small.png')
image = img_as_float(image)
image = image.reshape(image.shape[0]*image.shape[1], image.shape[2])
# Cluster the pixels into K groups with at most `maxinter` K-means iterations.
K = 16
maxinter = 10
initial_cen = kmeans_init_centroid(image, K)
index = find_close_centroid(image, initial_cen)
cen_res, index_res = get_result_kmeans(image, index, initial_cen, maxinter)
# Draw 1000 random pixel indices (sampled with replacement).
select = np.random.randint(image.shape[0], size=1000)
# 3-D scatter of the sampled pixels. plt.get_cmap is used because
# plt.cm.get_cmap was removed in Matplotlib 3.9.
cm = plt.get_cmap('RdYlBu')
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
# Colour by the final K-means labels (index_res), not by the initial
# assignment made against the random starting centroids.
ax.scatter(image[select, 0], image[select, 1], image[select, 2], c=index_res[select, 0].astype(np.float64), s=15, cmap=cm, vmin=0, vmax=K)
plt.title('Pixel dataset plotted in 3D. Color shows centroid memberships')
plt.show()
# PCA: normalise the pixels and project them onto the first 2 columns of u.
imgae_norm = feature_normalize(image)
u, s = pca(imgae_norm)
K = 2
Z = project_data(imgae_norm, u, K)
plt.scatter(Z[select, 0], Z[select, 1], c=index_res[select, 0].astype(np.float64), s=15, cmap=cm)
plt.show()
结果:
Kmeans处理图像数据:
PCA将三维数据降维成二维数据: