简介
本文是吴恩达《机器学习》系列课程习题七的 Python 解答。习题七的主题是无监督学习:应用 K-means 进行图片压缩,并实践 PCA(主成分分析)。
K-means代码
实现K-means聚类,并使用它来压缩图像。 我们将从一个简单的2D数据集开始,以了解K-means是如何工作的,然后我们将其应用于图像压缩。
import numpy as np
import pandas as pd
from scipy.io import loadmat
import seaborn as sb
import matplotlib.pyplot as plt
import sys
# cluster data
def find_closest_centroids(X, centroids):
    """Assign every sample to the index of its nearest centroid.

    Distances are squared Euclidean distances. Ties go to the centroid with
    the lowest index.

    Parameters
    ----------
    X : array-like of shape (m, n)
        One sample per row.
    centroids : array-like of shape (k, n)
        One centroid per row (a list of lists is accepted).

    Returns
    -------
    numpy.ndarray of int, shape (m,)
        ``result[i]`` is the row index in ``centroids`` closest to ``X[i]``.
    """
    # Work in float: with integer inputs (e.g. uint8 pixels) the subtraction
    # and the square wrap around and yield wrong distances.
    points = np.asarray(X, dtype=float)
    centers = np.asarray(centroids, dtype=float)
    m = points.shape[0]
    k = len(centers)
    if k == 0:
        return np.zeros(m, dtype=int)

    distances = np.empty((m, k))
    for j in range(k):
        distances[:, j] = np.sum(np.square(points - centers[j]), axis=1)
    # A NaN distance (e.g. from a NaN centroid) must never win the argmin.
    distances[np.isnan(distances)] = np.inf
    return np.argmin(distances, axis=1).astype(int)
data = loadmat('../data_sets/ex7data2.mat')
# plot original data
df = pd.DataFrame(data.get('X'), columns=['x1', 'x2'])
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(df['x1'], df['x2'], label='ex7data2')
# legend() and show() have to be called; naming the attribute alone does
# nothing, so the legend was never drawn and the figure never shown.
ax.legend()
plt.show()
# Hand-picked starting centroids, used to check the assignment step.
init_centroids = [[3, 3], [6, 2], [8, 5]]
res = find_closest_centroids(data.get('X'), init_centroids)
def compute_new_centroids(index, X, k):
    """Return the mean of the samples assigned to each of the k clusters.

    Parameters
    ----------
    index : array-like of int, shape (m,)
        Cluster label of every sample.
    X : array-like of shape (m, n)
        One sample per row.
    k : int
        Number of clusters.

    Returns
    -------
    numpy.ndarray of float, shape (k, n)
        Row ``j`` is the mean of the samples labelled ``j``. A cluster with
        no samples yields a row of NaN, as before, but without going through
        a 0/0 division and its RuntimeWarning.
    """
    samples = np.asarray(X, dtype=float)
    labels = np.asarray(index)
    centroids = np.full((k, samples.shape[1]), np.nan)
    for j in range(k):
        members = samples[labels == j]
        if len(members):
            centroids[j] = members.mean(axis=0)
    return centroids
# One update step: recompute the three centroids from the assignments `res`
# obtained with the hand-picked initial centroids.
centroids = compute_new_centroids(res,data.get('X'),3)
# Bare expression: shown as the cell output when run in a notebook.
centroids
# array([[2.42830111, 3.15792418], [5.81350331, 2.63365645], [7.11938687, 3.6166844 ]])
# Pick three rows of the data at random as the starting centroids for the full run.
init_centroids = df.sample(3)
def compare_lists(l1, l2):
    """Return True when two sequences of rows are element-wise equal.

    Parameters
    ----------
    l1, l2 : sequence of sequences
        For example lists of lists or 2-D arrays.

    Returns
    -------
    bool
        False if the number of rows, the length of any pair of rows, or any
        pair of elements differs; True otherwise.
    """
    if len(l1) != len(l2):
        return False
    for row_a, row_b in zip(l1, l2):
        # Rows of different length can never be equal. Previously a longer
        # second row was silently accepted and a shorter one raised IndexError.
        if len(row_a) != len(row_b):
            return False
        for a, b in zip(row_a, row_b):
            if a != b:
                return False
    return True
# iterate until the centroids do not change
def kmeans(X, centroids, k):
    """Run K-means until the centroids stop changing.

    Parameters
    ----------
    X : array-like of shape (m, n)
        One sample per row.
    centroids : array-like of shape (k, n)
        Initial centroids; the caller's object is not modified.
    k : int
        Number of clusters (expected to equal ``len(centroids)``).

    Returns
    -------
    labels : numpy.ndarray of int, shape (m,)
        Index of the centroid assigned to every sample.
    centroids : numpy.ndarray of float, shape (k, n)
        Final centroids (always an ndarray, even if the initial guess was
        already a fixed point).
    loss : float
        Mean squared distance between each sample and its centroid.
    """
    # Float copy: avoids integer wrap-around in the distance computation and
    # guarantees an ndarray is returned.
    centroids = np.array(centroids, dtype=float)
    while True:
        labels = find_closest_centroids(X, centroids)
        next_centroids = np.asarray(compute_new_centroids(labels, X, k), dtype=float)
        # A cluster that received no samples comes back as NaN. NaN never
        # compares equal, so without this the loop would not terminate; keep
        # the previous centroid for such clusters instead.
        empty = np.isnan(next_centroids).any(axis=1)
        next_centroids[empty] = centroids[empty]
        if np.array_equal(next_centroids, centroids):
            break
        centroids = next_centroids

    residuals = np.asarray(X, dtype=float) - centroids[labels]
    loss = np.mean(np.sum(np.square(residuals), axis=1))
    return labels, centroids, loss
centroids = init_centroids.values.tolist()
# Pass a plain ndarray; np.matrix is no longer recommended by NumPy and is
# not needed by kmeans.
cluster, centroids, loss = kmeans(df.values, centroids, 3)
# Guarantee 2-D array indexing below even if kmeans hands back a list.
centroids = np.asarray(centroids)
X = data.get('X')
cluster1, cluster2, cluster3 = (X[cluster == label] for label in range(3))
fig, ax = plt.subplots(figsize=(12, 8))
for points, color, name in (
    (cluster1, 'r', 'Cluster 1'),
    (cluster2, 'g', 'Cluster 2'),
    (cluster3, 'b', 'Cluster 3'),
):
    ax.scatter(points[:, 0], points[:, 1], s=5, color=color, label=name)
ax.scatter(centroids[:, 0], centroids[:, 1], s=15, color='y', marker='X', label='Centroids')
ax.legend()
plt.show()
可见模型清楚地将数据区分开了。接下来将 K-means 应用于图片压缩,核心思想是用簇中心节点代替簇中的所有点。
from IPython.display import Image
# Bare expression: displays the original PNG as the cell output in a notebook.
Image(filename='../data_sets/bird_small.png')
image_data = loadmat('../data_sets/bird_small.mat')
A = image_data.get('A')
A.shape
# (128, 128, 3)
# Merge the first two axes so that each row of A holds the A.shape[2] channel
# values of one pixel.
A = np.reshape(A,(A.shape[0]*A.shape[1],A.shape[2]))
A.shape
# (16384, 3)
# apply elbow method to decide k
# Starting centroids are drawn from the distinct colours of the image, as
# floats and without replacement. Sampling pixels with replacement could pick
# the same colour twice, which leaves a cluster empty; float centroids avoid
# integer wrap-around if A holds small integer pixel values.
init_pool = np.unique(A, axis=0).astype(float)
k_arr = range(1, 15, 2)
loss_arr = []
for k in k_arr:
    centroids = init_pool[np.random.choice(init_pool.shape[0], k, replace=False)]
    idx, centroids, loss = kmeans(A, centroids, k)
    loss_arr.append(loss)
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(k_arr, loss_arr, label='loss')
ax.legend()
plt.show()
idx, centroids, loss = kmeans(
    A, init_pool[np.random.choice(init_pool.shape[0], 5, replace=False)], 5
)
idx, centroids, loss
# (array([4, 4, 4, ..., 0, 0, 0]),
# array([[ 31.00135606, 31.82874855, 29.45176288],
# [109.80113636, 92.46519886, 81.09090909],
# [186.44011849, 137.21074905, 72.9813796 ],
# [238.53100159, 224.93441971, 191.54173291],
# [200.65381344, 172.38984973, 130.76552311]]),
# 1509.393693869752)
# Replace every pixel by the centroid of its cluster, then scale to [0, 1]
# for imshow.
recoverd_pic = centroids[idx, :]
recoverd_pic = recoverd_pic / 255
plt.imshow(recoverd_pic.reshape(128, 128, 3))
从图片可以看出,大部分的特征还是得到了保留。我们调整一下 k(下面改用 scikit-learn 的 KMeans,取 k = 16)来看一下效果。
from sklearn.cluster import KMeans
model = KMeans(n_clusters=16, n_init=10)
model.fit(A)
centroids = model.cluster_centers_
# Predict on the pixel matrix A. `data` is the dict returned by loadmat for
# ex7data2, not the image, so passing it here was a bug.
C = model.predict(A)
# Replace each pixel by its cluster centre and scale to [0, 1] for imshow.
compress_pic = centroids[C].reshape(128, 128, 3) / 255
plt.imshow(compress_pic)
可以看到K增大可以复原更多的信息,本例中极限情况下(k = 16384)得到的图像将会和原来一样,因为每一个点都是簇中心点。
PCA
# Load the 2-D PCA example data and plot it.
data = loadmat('../data_sets/ex7data1.mat')
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(data.get('X')[:,0],data.get('X')[:,1],label='ex7data1')
ax.legend()
plt.show()
# PCA assumes features have similar scales, so standardise each column first.
df = pd.DataFrame(data.get('X'),columns=['x1','x2'])
df = (df - df.mean(axis=0))/df.std(axis=0)
# np.matrix makes `*` a matrix product in the covariance expression below.
X = np.matrix(df)
# Covariance of the standardised data, divided by the number of rows.
# NOTE(review): the 0.98 diagonal below is presumably because pandas' std
# defaults to ddof=1 while this divides by the row count. Confirm.
cov = (X.T * X)/X.shape[0]
cov
# matrix([[0.98 , 0.72081977],
# [0.72081977, 0.98 ]])
# SVD of the covariance matrix; the columns of U are later used as the
# projection directions and S holds the singular values.
U, S, V = np.linalg.svd(cov)
U, S, V
# (matrix([[-0.70710678, -0.70710678],
# [-0.70710678, 0.70710678]]),
# array([1.70081977, 0.25918023]),
# matrix([[-0.70710678, -0.70710678],
# [-0.70710678, 0.70710678]]))
# np.sum(S[:k])/np.sum(S) should be reasonably close to 0.99
def project_data(X, U, k):
    """Project the rows of X onto the first k columns of U.

    Parameters
    ----------
    X : 2-D array or matrix, one sample per row.
    U : 2-D array or matrix whose columns are the projection directions.
    k : int, number of leading columns of U to keep.

    Returns
    -------
    The product of X with the first k columns of U.
    """
    leading_components = U[:, :k]
    projection = np.dot(X, leading_components)
    return projection
def recover_data(Z, U, k):
    """Map projected data back to the original feature space.

    Parameters
    ----------
    Z : 2-D array or matrix of projected samples, one per row (k columns).
    U : 2-D array or matrix whose columns are the projection directions.
    k : int, number of leading columns of U that were used for the projection.

    Returns
    -------
    The matrix product of Z with the transpose of the first k columns of U.
    """
    U_reduced = U[:, :k]
    # np.dot is a matrix product for both ndarray and np.matrix inputs.
    # `Z * U_reduced.T` was only a matrix product for np.matrix and an
    # elementwise product (or a shape error) for plain arrays.
    return np.dot(Z, U_reduced.T)
# Reduce the standardised data to one dimension, map it back, and plot the
# reconstruction against the original points.
Z = project_data(X, U, 1)
X_recovered = recover_data(Z, U, 1)
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(
    np.asarray(X_recovered[:, 0]).ravel(),
    np.asarray(X_recovered[:, 1]).ravel(),
    label='PCA',
)
ax.scatter(
    np.asarray(X[:, 0]).ravel(),
    np.asarray(X[:, 1]).ravel(),
    label='original',
)
ax.legend()
plt.show()
faces = loadmat('../data_sets/ex7faces.mat')
# `io` was never imported in this file, so io.imshow raised NameError; use
# the already-imported matplotlib instead.
plt.imshow(np.reshape(faces.get('X')[3, :], (32, 32)).T, cmap='gray')
plt.show()
faces.get('X').shape
# (5000, 1024)
# Kept as np.matrix so that `*` below is a matrix product.
X = np.matrix(faces.get('X'))
cov = (X.T * X) / X.shape[0]
U, S, V = np.linalg.svd(cov)
# Smallest k whose leading singular values account for at least 99% of their
# total. Computed with cumsum/searchsorted instead of accumulating into a
# module-level variable named `sum`, which shadowed the builtin.
total = np.sum(S)
retained = np.cumsum(S) / total
k = min(int(np.searchsorted(retained, 0.99)) + 1, len(S))
k
# 335
compress_faces = project_data(X, U, k)
recover_faces = recover_data(compress_faces, U, k)
recover_faces = np.array(recover_faces)
plt.imshow(recover_faces[3, :].reshape(32, 32).T, cmap='gray')
plt.show()
如此,我们便将最具有意义的feature过滤出来,对数据集实现了降维。
数据集
链接: https://pan.baidu.com/s/1zteJBsMJ0GRwqRb5opOgwg 提取码: 78ah