吴恩达《机器学习》ex7

简介

本文是吴恩达《机器学习》系列课程习题七的Python解答。习题七是无监督学习:应用K-means进行图片压缩,并实践PCA(主成分分析)。

K-means代码

实现K-means聚类,并使用它来压缩图像。 我们将从一个简单的2D数据集开始,以了解K-means是如何工作的,然后我们将其应用于图像压缩。

import numpy as np
import pandas as pd
from scipy.io import loadmat
import seaborn as sb
import matplotlib.pyplot as plt
import sys

# cluster data
def find_closest_centroids(X,centroids):
    """Assign every sample to the index of its nearest centroid.

    Distances are squared Euclidean distances computed in floating point, so
    unsigned-integer inputs (e.g. uint8 pixel values) cannot wrap around
    during the subtraction or the squaring.

    Args:
        X: 2-D array-like of shape (m, n), one sample per row.
        centroids: array-like of shape (k, n), one centroid per row.

    Returns:
        Integer ndarray of length m holding, for each sample, the row index
        of the closest centroid. Ties go to the lowest index; centroids whose
        distance is NaN are never selected.
    """
    samples = np.asarray(X, dtype=float)
    centers = np.asarray(centroids, dtype=float)
    if len(centers) == 0:
        # Nothing to compare against: every sample keeps the default index 0.
        return np.zeros(samples.shape[0], dtype=int)
    # One column of squared distances per centroid -> shape (m, k).
    squared = np.stack(
        [np.sum(np.square(samples - center), axis=1) for center in centers],
        axis=1,
    )
    # argmin would pick a NaN column first; treat NaN as "infinitely far".
    squared = np.where(np.isnan(squared), np.inf, squared)
    return np.argmin(squared, axis=1).astype(int)
    
data = loadmat('../data_sets/ex7data2.mat')

# plot original data
df = pd.DataFrame(data.get('X'),columns=['x1','x2'])
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(df['x1'],df['x2'],label='ex7data2')
# legend() and show() have to be called; a bare attribute reference does nothing.
ax.legend()
plt.show()

data1

# Hand-picked starting centroids for the three clusters.
init_centroids = [[3, 3], [6, 2], [8, 5]]
# Index of the nearest starting centroid for every sample.
res = find_closest_centroids(X=data.get('X'), centroids=init_centroids)

def compute_new_centroids(index,X,k):
    """Return the mean of the samples assigned to each cluster.

    Args:
        index: 1-D integer sequence of length m; ``index[i]`` is the cluster
            (0 .. k-1) that sample ``i`` belongs to.
        X: 2-D array-like of shape (m, n), one sample per row.
        k: number of clusters.

    Returns:
        Float ndarray of shape (k, n). The row of a cluster that received no
        samples is filled with NaN (its mean is undefined).
    """
    samples = np.asarray(X, dtype=float)
    labels = np.asarray(index, dtype=int)
    # bincount rejects negative labels, which add.at would silently wrap.
    counts = np.bincount(labels, minlength=k)
    totals = np.zeros((k, samples.shape[1]))
    # Unbuffered scatter-add: rows sharing a label accumulate into one slot.
    np.add.at(totals, labels, samples)
    centroids = np.full((k, samples.shape[1]), np.nan)
    occupied = counts > 0
    # Divide only where defined so empty clusters do not trigger a 0/0 warning.
    centroids[occupied] = totals[occupied] / counts[occupied, np.newaxis]
    return centroids

# Centroids after one update step from the hand-picked start.
centroids = compute_new_centroids(index=res, X=data.get('X'), k=3)
centroids
# array([[2.42830111, 3.15792418], [5.81350331, 2.63365645], [7.11938687, 3.6166844 ]])

# Three randomly drawn samples serve as the next starting centroids.
init_centroids = df.sample(n=3)

def compare_lists(l1,l2):
    """Return True when two sequences of rows are element-wise equal.

    Args:
        l1: sequence of rows (lists or array rows).
        l2: sequence of rows to compare against ``l1``.

    Returns:
        True if both have the same number of rows, every pair of rows has the
        same length, and all corresponding elements compare equal; otherwise
        False. NaN elements never compare equal.
    """
    if len(l1) != len(l2):
        return False
    for row1, row2 in zip(l1, l2):
        # Rows of different length can never match; checking this up front
        # avoids an IndexError and avoids ignoring trailing elements.
        if len(row1) != len(row2):
            return False
        if any(a != b for a, b in zip(row1, row2)):
            return False
    return True
# iterate until the centroids do not change
def kmeans(X,centroids,k):
    """Alternate assignment and centroid updates until the centroids are stable.

    Args:
        X: 2-D array-like of shape (m, n), one sample per row.
        centroids: array-like of shape (k, n) with the starting centroids.
        k: number of clusters.

    Returns:
        Tuple ``(dis, centroids, loss)``: the cluster index of every sample,
        the final centroids as a float ndarray, and the mean squared distance
        between each sample and its assigned centroid.
    """
    centroids = np.asarray(centroids, dtype=float)
    while True:
        dis = find_closest_centroids(X, centroids)
        next_centroids = np.asarray(compute_new_centroids(dis, X, k), dtype=float)
        # A cluster with no samples comes back as NaN. NaN never compares
        # equal, so the convergence test below could never succeed; keep the
        # previous centroid for such clusters instead.
        if next_centroids.shape == centroids.shape:
            next_centroids = np.where(
                np.isnan(next_centroids), centroids, next_centroids
            )
        if compare_lists(next_centroids, centroids):
            break
        centroids = next_centroids
    # Float arithmetic so integer samples cannot wrap in the subtraction.
    residuals = np.asarray(X, dtype=float) - centroids[dis]
    loss = np.sum(np.square(residuals)) / len(dis) if len(dis) else 0.0
    return dis, centroids, loss

# Cluster ex7data2 starting from the three sampled rows. A plain ndarray is
# passed instead of np.matrix, which NumPy no longer recommends.
centroids = init_centroids.values.tolist()
cluster,centroids,loss = kmeans(df.values,centroids,3)
# kmeans may hand back the starting list unchanged if it converges at once;
# the column slicing below needs an array (no-op when it already is one).
centroids = np.asarray(centroids)

X = data.get('X')
cluster1, cluster2, cluster3 = (X[cluster == label] for label in range(3))

fig, ax = plt.subplots(figsize=(12,8))
for points, colour, name in zip(
    (cluster1, cluster2, cluster3),
    ('r', 'g', 'b'),
    ('Cluster 1', 'Cluster 2', 'Cluster 3'),
):
    ax.scatter(points[:,0], points[:,1], s=5, color=colour, label=name)
ax.scatter(centroids[:,0], centroids[:,1], s=15, color='y', marker='X',label='Centroids')
ax.legend()
plt.show()

clusters
可见模型清楚地将数据区分开了。接下来将K-means应用于图片压缩,核心思想是用簇中心点代替簇中的所有点。

# Show the original picture; as the last expression of a notebook cell the
# Image object is rendered inline.
from IPython.display import Image
Image(filename='../data_sets/bird_small.png')

bird

image_data = loadmat('../data_sets/bird_small.mat')
A = image_data.get('A')
A.shape
# (128, 128, 3)

# Flatten to one row per pixel and cast to float: if the .mat file stores the
# channels as unsigned 8-bit integers (NOTE(review): confirm), differences and
# squares in the distance computation would otherwise wrap around.
A = np.reshape(A,(A.shape[0]*A.shape[1],A.shape[2])).astype(float)
A.shape
# (16384, 3)

# apply elbow method to decide k
k_arr = range(1,15,2)
loss_arr = []
# Start only from distinct colours, drawn without replacement. Two identical
# starting centroids leave one cluster empty; its centroid becomes NaN, which
# never compares equal, so kmeans would keep iterating forever.
unique_rows = np.unique(A,axis=0)
for k in k_arr:
    centroids = unique_rows[np.random.choice(unique_rows.shape[0],k,replace=False)]
    idx,centroids,loss = kmeans(A,centroids,k)
    loss_arr.append(loss)

fig, ax = plt.subplots(figsize=(12,8))
ax.plot(k_arr,loss_arr,label='loss')
ax.legend()
plt.show()

elbow

# Five distinct colours as starting centroids (duplicates would leave a
# cluster empty and its centroid NaN, which stalls the convergence test).
unique_rows = np.unique(A,axis=0)
idx,centroids,loss = kmeans(A,unique_rows[np.random.choice(unique_rows.shape[0],5,replace=False)],5)
idx,centroids,loss
# Example output of one run (the start is random, so values vary):
# (array([4, 4, 4, ..., 0, 0, 0]),
# array([[ 31.00135606,  31.82874855,  29.45176288],
#        [109.80113636,  92.46519886,  81.09090909],
#        [186.44011849, 137.21074905,  72.9813796 ],
#        [238.53100159, 224.93441971, 191.54173291],
#        [200.65381344, 172.38984973, 130.76552311]]),
# 1509.393693869752)

# Replace each pixel by its cluster centroid, scaled to the [0, 1] range
# that imshow expects for float RGB data.
recoverd_pic = centroids[idx] / 255
plt.imshow(recoverd_pic.reshape(128, 128, 3))

在这里插入图片描述
从图片可以看出,大部分的特征还是得到了保留。我们调整一下k,再来看一下效果:

from sklearn.cluster import KMeans

# Cluster the pixel colours into 16 groups with scikit-learn.
model = KMeans(n_clusters=16, n_init=10)
model.fit(A)
centroids = model.cluster_centers_
# Predict on the pixel matrix A; `data` is the loadmat dict of ex7data2,
# not image data.
C = model.predict(A)
compress_pic = centroids[C].reshape(128,128,3)/255
plt.imshow(compress_pic)

在这里插入图片描述
可以看到K增大可以复原更多的信息,本例中极限情况下(k = 16384)得到的图像将会和原来一样,因为每一个点都是簇中心点。

PCA

# Load the 2-D PCA example set and scatter its first column against its second.
data = loadmat('../data_sets/ex7data1.mat')
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(*data.get('X')[:, :2].T, label='ex7data1')
ax.legend()
plt.show()

在这里插入图片描述

# PCA assumes the features have similar scales, so standardise each column.
df = pd.DataFrame(data.get('X'), columns=['x1', 'x2'])
df = df.sub(df.mean(axis=0)).div(df.std(axis=0))
X = np.matrix(df)
# pandas' std() uses ddof=1 by default while this divides by m, which is why
# the diagonal below prints as 0.98 rather than 1.
cov = X.T.dot(X) / X.shape[0]
cov
# matrix([[0.98      , 0.72081977],
#        [0.72081977, 0.98      ]])

# Singular value decomposition of the covariance matrix. The columns of U are
# used as principal directions by project_data/recover_data; S holds the
# singular values. np.linalg.svd returns the third factor already transposed
# (Vh), although it is named V here.
U, S, V = np.linalg.svd(cov)
U, S, V
# (matrix([[-0.70710678, -0.70710678],
#         [-0.70710678,  0.70710678]]),
# array([1.70081977, 0.25918023]),
# matrix([[-0.70710678, -0.70710678],
#         [-0.70710678,  0.70710678]]))

#  np.sum(S[:k])/np.sum(S) should be reasonably close to 0.99
def project_data(X, U, k):
    """Project the rows of X onto the first k columns of U.

    Args:
        X: 2-D data, one sample per row.
        U: 2-D matrix whose columns are the projection directions.
        k: number of leading columns of U to keep.

    Returns:
        The product of X with the first k columns of U.
    """
    return np.dot(X, U[:, :k])

def recover_data(Z, U, k):
    """Map projected data back into the original feature space.

    Args:
        Z: 2-D projected data with k columns, one sample per row.
        U: 2-D matrix whose first k columns were used for the projection.
        k: number of leading columns of U that were kept.

    Returns:
        The product of Z with the transpose of the first k columns of U.
    """
    U_reduced = U[:,:k]
    # np.dot is a matrix product for ndarray and np.matrix alike; `*` is only
    # a matrix product for np.matrix and broadcasts elementwise for ndarrays.
    return np.dot(Z, U_reduced.T)

# Project onto the first principal direction, map back, and compare with the
# normalised original points.
Z = project_data(X, U, 1)
X_recovered = recover_data(Z, U, 1)
fig, ax = plt.subplots(figsize=(12,8))
# np.asarray yields flat 1-D columns; list() over an np.matrix column would
# hand scatter a list of 1x1 matrix objects.
ax.scatter(np.asarray(X_recovered)[:,0],np.asarray(X_recovered)[:,1],label='PCA')
ax.scatter(np.asarray(X)[:,0],np.asarray(X)[:,1],label='original')
ax.legend()
plt.show()

在这里插入图片描述

faces = loadmat('../data_sets/ex7faces.mat')
# `io` is never imported in this file; matplotlib's imshow (already imported
# as plt) displays the 32x32 image instead. Row 3 is reshaped and transposed
# exactly as before.
plt.imshow(np.reshape(faces.get('X')[3,:],(32,32)).T,cmap='gray')

在这里插入图片描述

# Dimensions of the face matrix; each row is reshaped to 32x32 when displayed.
faces.get('X').shape
# (5000, 1024)

# X^T X / m of the face matrix, followed by its singular value decomposition.
X = np.matrix(faces.get('X'))
cov = X.T.dot(X) / X.shape[0]
U, S, V = np.linalg.svd(cov)

# Smallest number of leading singular values whose share of the total reaches
# 0.99. The running share is kept in `retained` so the builtin `sum` is not
# rebound at module level.
total = np.sum(S)
retained = np.cumsum(S / total)
k = min(int(np.searchsorted(retained, 0.99)) + 1, len(S))
k
# 335

# Compress the faces to k components and map them back to pixel space.
compress_faces = project_data(X, U, k)
recover_faces = recover_data(compress_faces, U, k)
recover_faces = np.array(recover_faces)
# `io` is never imported in this file; show the recovered face with
# matplotlib's imshow instead.
plt.imshow(recover_faces[3,:].reshape(32,32).T,cmap='gray')

在这里插入图片描述
如此,我们便将最具有意义的feature过滤出来,对数据集实现了降维。

数据集

链接: https://pan.baidu.com/s/1zteJBsMJ0GRwqRb5opOgwg 提取码: 78ah

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值