吴恩达机器学习课程:编程练习 | (7) ex7-kmeans and PCA

1. kmeans算法

"""
案例1: 给定一个二维数据集,使用kmeans进行聚类
数据集:data/ex7data2.mat
"""

import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
from skimage import io


def find_centroids(X, centros): # 获取每个样本所属的类别
    idx = []
    for i in range(len(X)):
        # (2,)(k,2)->(k,2)
        dist = np.linalg.norm((X[i] - centros), axis=1)  # (k,) 每一个样本点到中心点的距离,axis=1按行向量处理.dist->(3,)
        id_i = np.argmin(dist)  # 保存样本点到三个中心点最小距离的索引值。--索引值共有0,1,2三个,即三类
        idx.append(id_i)
    return np.array(idx)  # 每一个样本点对应一个类别(k),idx是按样本点顺序对应的分类标签数组


def compute_centros(X, idx, k): # 计算聚类中心点
    centros = []

    for i in range(k):
        centros_i = np.mean(X[idx == i], axis=0) # 计算每类中心点坐标(x,y)--axis=0压缩行,对每列求平均
        centros.append(centros_i)

    return np.array(centros)


def run_kmeans(X, centros, iters):
    k = len(centros)
    centros_all = [centros]  # 用来存储每一次迭代更新的聚类中心点坐标(X,y)
    centros_i = centros
    for i in range(iters):
        idx = find_centroids(X, centros_i)
        centros_i = compute_centros(X, idx, k)
        centros_all.append(centros_i)

    return idx, np.array(centros_all)  # idx返回最后一次迭代更新的分类结果,centros_all聚类迭代过程中所有的中心点坐标


def plot_data(X, centros_all, idx): # 绘制数据集和聚类中心的移动轨迹
    plt.figure()
    plt.scatter(X[:, 0], X[:, 1], c=idx, cmap='rainbow')
    plt.plot(centros_all[:, :, 0], centros_all[:, :, 1], 'kx--')


def init_centros(X,k):  # 随机初始化k个聚类中心点
    index = np.random.choice(len(X),k)
    return X[index]


if __name__ == '__main__':

    data1 = sio.loadmat('data/ex7data2.mat')
    print(data1)
    X = data1['X']
    plt.scatter(X[:, 0], X[:, 1])
    # 人为初始化三个聚类中心点
    centros = np.array([[3, 3], [6, 2], [8, 5]]) # 初始化三个聚类中心点(在此认为选择了三个点),即分三类
    idx = find_centroids(X, centros)
    idx, centros_all = run_kmeans(X, centros, iters=10)
    plot_data(X, centros_all, idx)

    # 随机初始化三个聚类中心点
    for i in range(4):
        idx, centros_all = run_kmeans(X, init_centros(X, k=3), iters=10)
        plot_data(X, centros_all, idx)

    plt.show()

2. PCA二维数据的降维

"""
案例1:使用PCA进行二维数据的降维
数据集:data/ex7data1.mat
"""

import numpy as np
import pandas as pd
import scipy.io as sio
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(context="notebook", style="white")

mat = sio.loadmat('./data/ex7data1.mat')
X = mat['X']  # X-->(50,2)
print(X.shape)
# 散点图
plt.figure()
plt.scatter(X[:,0],X[:,1])

# 对X去均值化
X_demean = X - np.mean(X, axis=0)
plt.figure()
plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号
plt.title('X去均值化的散点图')
plt.scatter(X_demean[:,0],X_demean[:,1])


C = X_demean.T @ X_demean / len(X) # 计算协方差矩阵n×n,n表示特征数量
print(C)
U, S, V = np.linalg.svd(C) # 计算特征值,特征向量。SVD奇异值分解

U_reduce = U[:,0]  # 选取前K向量,获得n×k的矩阵。此处选U的第一列(2,)
z = X_demean @ U_reduce  # 新特征向量z-->(50,1)

plt.figure(figsize=(5,5))
plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号
plt.title('数据降维')
plt.scatter(X_demean[:,0], X_demean[:,1])
plt.plot([0,U_reduce[0]], [0,U_reduce[1]], c='r')
plt.plot([0,U[:,1][0]], [0,U[:,1][1]], c='k')

# 还原数据---z为前面PCA压缩的数据,还原回原始的二维空间
X_approx = z.reshape(50,1) @ U_reduce.reshape(1,2) + np.mean(X,axis=0) # X_approx-->(50,2)近似等于原X

plt.figure()
plt.scatter(X[:,0], X[:,1])
plt.scatter(X_approx[:,0], X_approx[:,1])

3. PCA二维数据的降维-01

"""
案例1:使用PCA进行二维数据的降维
数据集:data/ex7data1.mat
"""

import numpy as np
import pandas as pd
import scipy.io as sio
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(context="notebook", style="white")


# support functions ---------------------------------------
def plot_n_image(X, n):
    """ plot first n images
    n has to be a square number
    """
    pic_size = int(np.sqrt(X.shape[1]))
    grid_size = int(np.sqrt(n))

    first_n_images = X[:n, :]

    fig, ax_array = plt.subplots(nrows=grid_size, ncols=grid_size,
                                 sharey=True, sharex=True, figsize=(8, 8))

    for r in range(grid_size):
        for c in range(grid_size):
            ax_array[r, c].imshow(first_n_images[grid_size * r + c].reshape((pic_size, pic_size)))
            plt.xticks(np.array([]))
            plt.yticks(np.array([]))


# PCA functions ---------------------------------------
def covariance_matrix(X):
    """
    Args:
        X (ndarray) (m, n)
    Return:
        cov_mat (ndarray) (n, n):
            covariance matrix of X
    """
    m = X.shape[0]

    return (X.T @ X) / m


def normalize(X):
    """
        for each column, X-mean / std
    """
    X_copy = X.copy()
    m, n = X_copy.shape

    for col in range(n):
        X_copy[:, col] = (X_copy[:, col] - X_copy[:, col].mean()) / X_copy[:, col].std()

    return X_copy


def pca(X):
    """
    http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.svd.html
    Args:
        X ndarray(m, n)
    Returns:
        U ndarray(n, n): principle components
    """
    # 1. normalize data
    X_norm = normalize(X)

    # 2. calculate covariance matrix
    Sigma = covariance_matrix(X_norm)  # (n, n)

    # 3. do singular value decomposition
    # remeber, we feed cov matrix in SVD, since the cov matrix is symmetry
    # left sigular vector and right singular vector is the same, which means
    # U is V, so we could use either one to do dim reduction
    U, S, V = np.linalg.svd(Sigma)  # U: principle components (n, n)

    return U, S, V


def project_data(X, U, k):
    """
    Args:
        U (ndarray) (n, n)
    Return:
        projected X (n dim) at k dim
    """
    m, n = X.shape

    if k > n:
        raise ValueError('k should be lower dimension of n')

    return X @ U[:, :k]


def recover_data(Z, U):
    m, n = Z.shape

    if n >= U.shape[0]:
        raise ValueError('Z dimension is >= U, you should recover from lower dimension to higher')

    return Z @ U[:, :n].T


if __name__ == '__main__':
    mat = sio.loadmat('./data/ex7data1.mat')
    X = mat.get('X')
    X_norm = normalize(X)
    print(X_norm.shape)
    Sigma = covariance_matrix(X_norm)
    print(Sigma)
    U, S, V = pca(X)
    Z = project_data(X_norm, U, 1)  # project data to lower dimension
    print(Z[:10])
    print(Z)
    X_recover = recover_data(Z, U)

    # project data to lower dimension
    fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 4))

    sns.regplot('X1', 'X2',
                data=pd.DataFrame(X_norm, columns=['X1', 'X2']),
                fit_reg=False,
                ax=ax1)
    ax1.set_title('Original dimension')

    sns.rugplot(Z, ax=ax2).set(xlim=(-3, 3))
    ax2.set_xlabel('Z')
    ax2.set_title('Z dimension')

    fig, (ax3, ax4, ax5) = plt.subplots(ncols=3, figsize=(16, 4))

    # recover data to original dimension
    sns.rugplot(Z, ax=ax3).set(xlim=(-3, 3))
    ax3.set_title('Z dimension')
    ax3.set_xlabel('Z')

    sns.regplot('X1', 'X2',
                data=pd.DataFrame(X_recover, columns=['X1', 'X2']),
                fit_reg=False,
                ax=ax4)
    ax4.set_title("2D projection from Z")

    sns.regplot('X1', 'X2',
                data=pd.DataFrame(X_norm, columns=['X1', 'X2']),
                fit_reg=False,
                ax=ax5)
    ax5.set_title('Original dimension')
    plt.show()

4. PCA图像降维

"""
案例2:使用PCA进行图像的降维,k=36
数据集:data/ex7faces.mat
具体算法公式--见笔记第14讲数据降维与重建
"""

import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt


# support functions ---------------------------------------
def plot_n_image(X, n): # 图像可视化,显示n张图片
    """ plot first n images
    n has to be a square number
    """
    pic_size = int(np.sqrt(X.shape[1]))
    grid_size = int(np.sqrt(n))

    first_n_images = X[:n, :]

    fig, ax_array = plt.subplots(nrows=grid_size, ncols=grid_size,
                                 sharey=True, sharex=True, figsize=(8, 8))

    for r in range(grid_size):
        for c in range(grid_size):
            ax_array[r, c].imshow(first_n_images[grid_size * r + c].reshape((pic_size, pic_size)))
            plt.xticks(np.array([]))
            plt.yticks(np.array([]))


# PCA functions ---------------------------------------
def covariance_matrix(X):
    """
    Args:
        X (ndarray) (m, n)
    Return:
        cov_mat (ndarray) (n, n):
            covariance matrix of X
    """
    m = X.shape[0]

    return (X.T @ X) / m


def normalize(X):
    """
        for each column, X-mean / std
    """
    X_copy = X.copy()
    m, n = X_copy.shape

    for col in range(n):
        X_copy[:, col] = (X_copy[:, col] - X_copy[:, col].mean()) / X_copy[:, col].std()

    return X_copy


def pca(X):
    """
    http://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.svd.html
    Args:
        X ndarray(m, n)
    Returns:
        U ndarray(n, n): principle components
    """
    # 1. normalize data
    X_norm = normalize(X)

    # 2. calculate covariance matrix
    Sigma = covariance_matrix(X_norm)  # (n, n)

    # 3. do singular value decomposition
    # remeber, we feed cov matrix in SVD, since the cov matrix is symmetry
    # left sigular vector and right singular vector is the same, which means
    # U is V, so we could use either one to do dim reduction
    U, S, V = np.linalg.svd(Sigma)  # U: principle components (n, n)

    return U, S, V


def project_data(X, U, k):
    """
    Args:
        U (ndarray) (n, n)
    Return:
        projected X (n dim) at k dim
    """
    m, n = X.shape

    if k > n:
        raise ValueError('k should be lower dimension of n')

    return X @ U[:, :k]


def recover_data(Z, U):
    m, n = Z.shape

    if n >= U.shape[0]:
        raise ValueError('Z dimension is >= U, you should recover from lower dimension to higher')

    return Z @ U[:, :n].T


if __name__ == "__main__":
    mat = sio.loadmat('./data/ex7faces.mat')
    X = np.array([x.reshape((32, 32)).T.reshape(1024) for x in mat.get('X')])  # X-->(5000,1024)

    plot_n_image(X, n=64)
    U, _, _ = pca(X)

    # didn't see face in principle components
    plot_n_image(U, n=32)

    # reduce dimension to k=100
    Z = project_data(X, U, k=100)
    plot_n_image(Z, n=64)

    # recover from k=100
    X_recover = recover_data(Z, U) # 选取的Z的前100列(维)数据,还原结果,和原始图像近似
    plot_n_image(X_recover, n=64)
    plt.show()

# -----------------------sklearn PCA-------------------------------------
    from sklearn.decomposition import PCA
    sk_pca = PCA(n_components=100)
    Z = sk_pca.fit_transform(X)
    plot_n_image(Z, 64)
    plt.show()
    X_recover = sk_pca.inverse_transform(Z)
    plot_n_image(X_recover, n=64)
    plt.show()

5. 图片颜色聚类

"""
案例2: 使用kmeans对图片颜色进行聚类
RGB图像,每个像素点值范围0-255
数据集:data/bird_small.mat,data/bird_small.png
"""

import numpy as np
import scipy.io as sio
import matplotlib.pyplot as plt
from skimage import io


def find_centroids(X, centros):
    idx = []
    for i in range(len(X)):
        # (2,)(k,2)->(k,2)
        dist = np.linalg.norm((X[i] - centros), axis=1)  # (k,)
        id_i = np.argmin(dist)
        idx.append(id_i)
    return np.array(idx)


def compute_centros(X, idx, k):
    centros = []

    for i in range(k):
        centros_i = np.mean(X[idx == i], axis=0)
        centros.append(centros_i)

    return np.array(centros)


def run_kmeans(X, centros, iters):
    k = len(centros)
    centros_all = []
    centros_all.append(centros)
    centros_i = centros
    for i in range(iters):
        idx = find_centroids(X, centros_i)
        centros_i = compute_centros(X, idx, k)
        centros_all.append(centros_i)

    return idx, np.array(centros_all)


def init_centros(X, k):
    index = np.random.choice(len(X), k)
    return X[index]


if __name__ == '__main__':

    data = sio.loadmat('data/bird_small.mat')
    A = data['A']
    print(A.shape)

    image = io.imread('data/bird_small.png')
    fig, ax = plt.subplots(figsize=(8, 6))
    plt.imshow(image)
    plt.axis('off') # 不显示坐标轴


    A = A / 255
    A = A.reshape(-1, 3) # 三维->二维
    print(A.shape)
    k = 16
    idx, centros_all = run_kmeans(A, init_centros(A, k=16), iters=20)
    centros = centros_all[-1] # 迭代最后一次的聚类中心点坐标
    im = np.zeros(A.shape)
    for i in range(k):
        im[idx == i] = centros[i]
    im = im.reshape(128, 128, 3)
    # im = np.rint(im*255).astype('uint8') # 还原为0-255的rgb范围图像
    print(im)
    fig, ax = plt.subplots(figsize=(8, 6))
    plt.imshow(im)
    plt.show()
  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值