Machine Learning-Ex7（吴恩达课后习题）K-means Clustering and Principal Component Analysis

MyDreamingCode

于 2023-05-04 20:28:17 发布

阅读量513

点赞数

分类专栏：机器学习文章标签：机器学习 kmeans 算法

本文链接：https://blog.csdn.net/qq_61706112/article/details/130410962

版权

机器学习专栏收录该内容

9 篇文章 2 订阅

订阅专栏

1. K-means Clustering

1.1 Implementing K-means

1.1.1 Finding closest centroids

1.1.2 Computing centroid means

1.2 K-means on example dataset

1.3 Random initialization

1.4 Image compression with K-means

1.4.1 K-means on pixels

1.4.2 Use scikit-learn to realize K-means algorithm

2. Principal Component Analysis

2.1 Example Dataset

2.2 Implementing PCA

2.3 Dimensionality Reduction with PCA

2.3.1 Projecting the data onto the principal components

2.3.2 Reconstructing an approximation of the data

2.3.3 Visualizing the projections

2.4 Face Image Dataset

2.4.1 PCA on Faces

1. K-means Clustering

内容：我们先将K-means算法运用到二维数据集上，之后我们再将它运用到图片压缩上。

1.1 Implementing K-means

K-means：

1. Iteration

step1：将样本分配给离它最近的聚类中心（cluster assignment step）；

step2：将聚类中心移动到它所拥有的点的平均位置。（move centroid）。

2. Initialization

原因：可能会进入局部最优。

方法：多次随机初始化聚类中心并运行K-means，每次都计算代价函数值；最后选取代价函数值最小的那一次聚类结果。

1.1.1 Finding closest centroids

内容：将每个样本分配给离它们最近的聚类中心，并设定：

$\LARGE c^{(i)}:=j\; that\, minimizes\, \left \| x^{(i)}-\mu_{j} \right \|^{2}$

输出一个一维向量idx，其中第i个元素是离第i个样本最近的聚类中心的索引。

findClosestCentroids.py

import numpy as np

def findClosestCentroids(X, centroids):
    """Assign every sample to its nearest centroid.

    Args:
        X: array of shape (m, n), one sample per row.
        centroids: array of shape (K, n), one centroid per row.

    Returns:
        Float array of length m; entry i is the 0-based index of the centroid
        with the smallest squared Euclidean distance to X[i]. Ties go to the
        lowest index. With no centroids at all, every entry is 0.
    """
    X = np.asarray(X, dtype=float)
    centroids = np.asarray(centroids, dtype=float)
    if centroids.shape[0] == 0:
        return np.zeros(X.shape[0])
    # (m, K) table of squared distances, built by broadcasting.
    diff = X[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    dist = np.sum(diff ** 2, axis=2)
    # A NaN distance (e.g. a NaN centroid) must never win the comparison.
    dist = np.where(np.isnan(dist), np.inf, dist)
    # No finite sentinel is involved, so arbitrarily large distances are
    # handled. The result stays float, as callers convert with astype(int).
    return np.argmin(dist, axis=1).astype(float)

main.py

from scipy.io import loadmat
import numpy as np
from findClosestCentroids import *  # 找到最近的聚类中心

# NOTE(review): this K-means demo reads 'ex7data1.mat', while the other K-means
# snippets in this file read 'ex7data2.mat' — confirm which dataset is intended.
raw_data = loadmat('ex7data1.mat')
X = raw_data['X']
# Three hand-picked starting centroids, one per row.
initial_centroids = np.array([[3, 3], [6, 2], [8, 5]])
idx = findClosestCentroids(X, initial_centroids)
# Show the first eight samples and the centroid index assigned to each.
print(X[:8])
print(idx[:8])

[[3.38156267 3.38911268]
[4.52787538 5.8541781 ]
[2.65568187 4.41199472]
[2.76523467 3.71541365]
[2.84656011 4.17550645]
[3.89067196 6.48838087]
[3.47580524 3.63284876]
[5.91129845 6.68076853]]
[0. 0. 0. 0. 0. 0. 0. 2.]

main.py

from scipy.io import loadmat
import pandas as pd

# Load the samples and wrap them in a table with named columns.
samples = loadmat('ex7data2.mat')['X']
data = pd.DataFrame(samples, columns=['X1', 'X2'])
# Preview the first rows.
print(data.head())

X1 X2
0 1.842080 4.607572
1 5.658583 4.799964
2 6.352579 3.290854
3 2.904017 4.612204
4 3.231979 4.939894

数据可视化

plot.py

def plotData(ax, X):
    """Scatter the 'X1' and 'X2' entries of ``X`` on ``ax`` and label both axes."""
    horizontal, vertical = 'X1', 'X2'
    ax.scatter(X[horizontal], X[vertical])
    ax.set_xlabel(horizontal)
    ax.set_ylabel(vertical)

main.py

from scipy.io import loadmat
import pandas as pd
import matplotlib.pyplot as plt
from plot import *  # 绘图

# Read the samples and give the columns the names plotData() looks up.
data = pd.DataFrame(loadmat('ex7data2.mat')['X'], columns=['X1', 'X2'])
# Scatter the raw samples on an 8x6 figure.
fig, ax = plt.subplots(figsize=(8, 6))
plotData(ax, data)
plt.show()

1.1.2 Computing centroid means

内容：计算聚类中心。聚类中心重新赋值为分配给该聚类中心的所有点的平均值，即：

$\mu_{k}:=\frac{1}{\left | C_{k} \right |}\sum_{i\in C_{k}}x^{(i)}$

np.where和索引用法例子

test.py

import numpy as np

# Cluster label of each of the nine samples.
idx = np.array([0, 0, 1, 2, 1, 2, 2, 0, 1])
# For every label, np.where returns a 1-tuple holding the positions where it occurs.
for label in (0, 1, 2):
    indices = np.where(idx == label)
    print(indices)

(array([0, 1, 7], dtype=int64),)
(array([2, 4, 8], dtype=int64),)
(array([3, 5, 6], dtype=int64),)

computeCentroids.py

import numpy as np

def computeCentroids(X, idx, k):
    """Recompute each centroid as the mean of the samples assigned to it.

    Args:
        X: array of shape (m, n), one sample per row.
        idx: length-m array; idx[i] is the cluster index of X[i].
        k: number of clusters.

    Returns:
        Array of shape (k, n). Row j is the mean of the rows of X whose
        idx equals j. A cluster with no samples gets a row of NaN, so it
        stays visibly empty instead of being placed at an arbitrary point.
    """
    X = np.asarray(X, dtype=float)
    idx = np.asarray(idx)
    centroids = np.full((k, X.shape[1]), np.nan)
    for j in range(k):
        members = X[idx == j]
        # Skip empty clusters: averaging zero rows would divide by zero.
        if members.shape[0] > 0:
            centroids[j, :] = members.mean(axis=0)
    return centroids

main.py

from scipy.io import loadmat
import numpy as np
from findClosestCentroids import *  # 找到最近的聚类中心
from computeCentroids import *  # 计算聚类中心

# One K-means round by hand: assign the samples, then move the centroids.
X = loadmat('ex7data2.mat')['X']
initial_centroids = np.array([[3, 3], [6, 2], [8, 5]])
assignment = findClosestCentroids(X, initial_centroids)
re_centroids = computeCentroids(X, assignment, 3)
print(re_centroids)

[[2.42830111 3.15792418]
[5.81350331 2.63365645]
[7.11938687 3.6166844 ]]

1.2 K-means on example dataset

内容：实现K-means算法，将前面两个步骤放入循环中。

runKMeans.py

import numpy as np
from findClosestCentroids import *  # 找到最近的聚类中心
from computeCentroids import *  # 计算聚类中心

def runKMeans(X, initial_centroids, max_iters):
    """Run K-means: alternate cluster assignment and centroid update.

    Args:
        X: array of shape (m, n), one sample per row.
        initial_centroids: array of shape (K, n) with the starting centroids.
        max_iters: maximum number of assignment/update rounds.

    Returns:
        (idx, centroids): idx is the length-m assignment produced in the last
        round that ran, centroids the (K, n) centroids after that round. With
        max_iters == 0 this is an all-zero idx and the initial centroids.
    """
    k = initial_centroids.shape[0]
    idx = np.zeros(X.shape[0])
    centroids = initial_centroids
    for _ in range(max_iters):
        idx = findClosestCentroids(X, centroids)
        updated = computeCentroids(X, idx, k)
        # Once an update leaves the centroids unchanged, every further round
        # would reproduce the same assignment and centroids, so stop early.
        converged = np.array_equal(updated, centroids)
        centroids = updated
        if converged:
            break
    return idx, centroids

plot.py

import numpy as np

def plotData(ax, X):
    """Draw ``X['X1']`` against ``X['X2']`` as a scatter plot on ``ax``.

    The axis labels are set to the same two names.
    """
    x_name, y_name = 'X1', 'X2'
    ax.scatter(X[x_name], X[y_name])
    ax.set_xlabel(x_name)
    ax.set_ylabel(y_name)

def plotClusteringData(ax, X, idx, centroids):
    """Scatter every cluster and its centroid on ``ax``, one colour per cluster.

    Args:
        ax: axes object providing ``scatter`` and ``legend``.
        X: array whose first two columns are the plotted coordinates.
        idx: length-m array; idx[i] is the cluster index of X[i].
        centroids: array with one centroid per row.
    """
    # The first three colours are the ones used so far; the palette is cycled
    # so that any number of clusters can be drawn without an IndexError.
    palette = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
    k = centroids.shape[0]
    for i in range(k):
        shade = palette[i % len(palette)]
        cluster_i = X[np.where(idx == i)[0], :]
        ax.scatter(cluster_i[:, 0], cluster_i[:, 1], s=10, c=shade, label='Cluster{}'.format(i + 1))
        ax.scatter(centroids[i, 0], centroids[i, 1], s=60, marker='x', c=shade,
                   label='cluster centroid{}'.format(i + 1))
    # Build the legend once, after all points have been drawn.
    if k > 0:
        ax.legend()

main.py

from scipy.io import loadmat
import numpy as np
import matplotlib.pyplot as plt
from runKMeans import *  # 运行K-means算法
from plot import *  # 绘制聚类好的数据

X = loadmat('ex7data2.mat')['X']
# Fixed starting centroids and a ten-round budget.
initial_centroids = np.array([[3, 3], [6, 2], [8, 5]])
max_iters = 10
idx, centroids = runKMeans(X, initial_centroids, max_iters)
# Draw the resulting clusters together with their centroids.
fig, ax = plt.subplots(figsize=(8, 6))
plotClusteringData(ax, X, idx, centroids)
plt.show()

1.3 Random initialization

内容：初始化聚类中心，可以将数据打乱，再选择K个点作为聚类中心。

kMeansInitCentroids.py

import numpy as np

def kMeansInitCentroids(X, K):
    """Pick K random samples of X as the initial centroids.

    Args:
        X: array of shape (m, n), one sample per row.
        K: number of centroids to pick.

    Returns:
        Float array of shape (K, n) whose rows are copies of rows of X.
        The rows are distinct samples whenever K <= m.
    """
    X = np.asarray(X)
    r = X.shape[0]
    # Draw row indices without replacement: picking the same sample twice
    # would give two identical centroids and leave one cluster empty.
    # Replacement is only allowed when more centroids than samples are asked for.
    rows = np.random.choice(r, size=K, replace=K > r)
    return X[rows, :].astype(float)

main.py（将迭代次数设置为1以便看到初始化聚类中心的位置）

from scipy.io import loadmat
import matplotlib.pyplot as plt
from kMeansInitCentroids import *  # 初始化聚类中心
from runKMeans import *  # 运行K-means算法
from plot import *  # 绘制聚类好的数据

raw_data = loadmat('ex7data2.mat')
X = raw_data['X']
K = 3
initial_centroids = kMeansInitCentroids(X, K)
# A single iteration: idx is the assignment of every sample to the *initial*
# centroids, while the centroids returned by runKMeans have already been
# moved once.
max_iters = 1
idx, centroids = runKMeans(X, initial_centroids, max_iters)
fig, ax = plt.subplots(figsize=(8, 6))
# Plot the initial centroids (not the moved ones), so the figure shows the
# random starting positions.
plotClusteringData(ax, X, idx, initial_centroids)
plt.show()

1.4 Image compression with K-means

内容：原图像是由24bit的颜色呈现的，每个像素由3个8bit的无符号整数（0-255）分别表示红、绿、蓝的强度值。在这张包含上千种颜色的图像上使用K-means算法，最后只使用16种颜色将图像呈现出来。

1.4.1 K-means on pixels

原图展现

import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# mpimg.imread() reads the pixel data of an image file;
# plt.imshow() draws array-like image data.
data_img = mpimg.imread('bird_small.png')
plt.imshow(data_img)
plt.show()

图像数据

main.py

from scipy.io import loadmat

# Load the image array stored under key 'A' and inspect its values and shape.
image_data = loadmat('bird_small.mat')['A']
print(image_data)
print(image_data.shape)

[[[219 180 103]
[230 185 116]
[226 186 110]
...
[ 14 15 13]
[ 13 15 12]
[ 12 14 12]]

[[230 193 119]
[224 192 120]
[226 192 124]
...
[ 16 16 13]
[ 14 15 10]
[ 11 14 9]]

[[228 191 123]
[228 191 121]
[220 185 118]
...
[ 14 16 13]
[ 13 13 11]
[ 11 15 10]]

...

[[ 15 18 16]
[ 18 21 18]
[ 18 19 16]
...
[ 81 45 45]
[ 70 43 35]
[ 72 51 43]]

[[ 16 17 17]
[ 17 18 19]
[ 20 19 20]
...
[ 80 38 40]
[ 68 39 40]
[ 59 43 42]]

[[ 15 19 19]
[ 20 20 18]
[ 18 19 17]
...
[ 65 43 39]
[ 58 37 38]
[ 52 39 34]]]
(128, 128, 3)

原始数据的处理

from scipy.io import loadmat
import numpy as np

raw_data = loadmat('bird_small.mat')
image_data = raw_data['A']  # pixel values given as RGB triples
# Scale the 0-255 channel values into [0, 1]; float RGB data outside that
# range would be clipped when displayed.
image_data = image_data / 255
# Flatten the image grid into one row per pixel, one column per channel.
X = np.reshape(image_data, (image_data.shape[0] * image_data.shape[1], image_data.shape[2]))
print(X.shape)  # (16384, 3)

main.py

from scipy.io import loadmat
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from kMeansInitCentroids import *  # 初始化聚类中心
from runKMeans import *  # 运行K-means算法

raw_data = loadmat('bird_small.mat')
image_data = raw_data['A']  # pixel values given as RGB triples
# Scale the 0-255 channel values into [0, 1]; float RGB data outside that
# range would be clipped by imshow().
image_data = image_data / 255
# One row per pixel, one column per colour channel.
X = np.reshape(image_data, (image_data.shape[0] * image_data.shape[1], image_data.shape[2]))
K = 16
max_iters = 10
# 1. randomly initialize centroids
initialize_centroids = kMeansInitCentroids(X, K)
# 2. run K-means algorithm
idx, centroids = runKMeans(X, initialize_centroids, max_iters)
# 3. get the closest centroids one last time
idx = findClosestCentroids(X, centroids)
# 4. map each pixel to the centroid value
# astype(int) turns the float indices into integers usable for row indexing
X_pixel = centroids[idx.astype(int), :]
X_pixel = np.reshape(X_pixel, (image_data.shape[0], image_data.shape[1], image_data.shape[2]))
# print(X_pixel.shape) #(128, 128, 3)
plt.imshow(X_pixel)
plt.show()

1.4.2 Use scikit-learn to realize K-means algorithm

A. 显示图像

main.py

import matplotlib.pyplot as plt
from skimage.io import imread, imshow

# Read the picture, draw it, then open the window.
img_data = imread('bird_small.png')
imshow(img_data)
plt.show()

B. 进行压缩

main.py

import matplotlib.pyplot as plt
from skimage.io import imread, imshow
from sklearn.cluster import KMeans  # sklearn机器学习库

img_data = imread('bird_small.png') / 255  # pixel values scaled by 1/255
height, width, channels = img_data.shape
data = img_data.reshape(height * width, channels)  # (16384, 3): one row per pixel

# Main KMeans parameters:
#   n_clusters - the value of K
#   n_init     - how many different centroid initialisations are tried
model = KMeans(n_clusters=16, n_init=100)
model.fit(data)
centroids = model.cluster_centers_  # fitted cluster centres
idx = model.predict(data)  # cluster index of every pixel, e.g. [1 1 1 ... 8 8 8]
# Replace each pixel by its cluster centre and restore the image layout.
compressed_data = centroids[idx, :].reshape(img_data.shape)  # (128, 128, 3)
fig, (left, right) = plt.subplots(1, 2)
left.imshow(img_data)  # original picture
right.imshow(compressed_data)  # picture rebuilt from the 16 centres
plt.show()

2. Principal Component Analysis

内容：先在2维数据中使用PCA，再对人脸图像数据进行PCA降维，在降维的同时尽可能多地保留方差（即数据的信息量）。

2.1 Example Dataset

内容：数据可视化

main.py

from scipy.io import loadmat
import matplotlib.pyplot as plt

# Scatter the two columns of the example data against each other.
X = loadmat('ex7data1.mat')['X']
fig, ax = plt.subplots(figsize=(8, 6))
first, second = X[:, 0], X[:, 1]
ax.scatter(first, second)
plt.show()

2.2 Implementing PCA

Step1: 计算协方差矩阵（先进行特征标准化）；

Step2: 计算协方差矩阵的特征向量和特征值以识别主成分（使用SVD奇异值分解）。

协方差矩阵： $\Sigma =\frac{1}{m}X^{T}X$

PCA.py

import numpy as np

def PCA(X):
    """Run PCA on X via SVD of the covariance matrix of the standardized data.

    Every feature (column) is standardized on its own before the covariance
    matrix is formed, so results differ from values obtained with a single
    global mean and standard deviation.

    Args:
        X: array of shape (m, n), one sample per row.

    Returns:
        (U, S, V) as produced by ``np.linalg.svd`` on the n x n covariance
        matrix: the columns of U are the principal components and S holds the
        matching singular values in descending order. U and V are returned as
        ``np.matrix`` so that callers multiplying with ``*`` still get a
        matrix product.
    """
    X = np.asarray(X, dtype=float)
    m = X.shape[0]
    # Per-feature standardization; a scalar mean/std over the whole array
    # would leave the individual columns uncentred.
    mu = X.mean(axis=0)
    sigma = X.std(axis=0)
    sigma[sigma == 0] = 1.0  # constant feature: avoid dividing by zero
    X_norm = (X - mu) / sigma
    cov = (X_norm.T @ X_norm) / m  # covariance matrix, shape (n, n)
    U, S, V = np.linalg.svd(cov)
    return np.matrix(U), S, np.matrix(V)

main.py

from scipy.io import loadmat
from PCA import *  # 运行PCA算法

X = loadmat('ex7data1.mat')['X']
# U holds the principal components; S and V are the other two SVD results.
U, S, V = svd_parts = PCA(X)
print(*svd_parts)

[[-0.79241747 -0.60997914]
[-0.60997914 0.79241747]] [1.43584536 0.56415464] [[-0.79241747 -0.60997914]
[-0.60997914 0.79241747]]

2.3 Dimensionality Reduction with PCA

内容：使用U矩阵（主成分），将原始数据投影到低维的空间里。

2.3.1 Projecting the data onto the principal components

内容：投影数据。

projectData.py

def projectData(X, U, K):
    """Project X onto the first K columns of U.

    Args:
        X: array of shape (m, n).
        U: (n, n) array or matrix whose columns are the principal components.
        K: number of leading components to keep.

    Returns:
        The (m, K) matrix product of X and U[:, :K], e.g. (50, 2) x (2, 1)
        gives (50, 1).
    """
    U_reduce = U[:, :K]
    # '@' is always the matrix product. '*' is one only when an operand is an
    # np.matrix; for plain arrays it multiplies element-wise.
    return X @ U_reduce

main.py

from scipy.io import loadmat
from PCA import *  # 运行PCA算法
from projectData import *  # 投影到低维空间

X = loadmat('ex7data1.mat')['X']
U = PCA(X)[0]  # columns of U are the principal components
# Keep only the first component.
# NOTE(review): PCA() normalizes its own copy of X, but the raw X is what gets
# projected here — confirm that projecting unnormalized data is intended.
Z = projectData(X, U, 1)
print(Z)

[[-4.74689738]
[-7.15889408]
[-4.79563345]
[-4.45754509]
[-4.80263579]
[-7.04081342]
[-4.97025076]
[-8.75934561]
[-6.2232703 ]
[-7.04497331]
[-6.91702866]
[-6.79543508]
[-6.3438312 ]
[-6.99891495]
[-4.54558119]
[-8.31574426]
[-7.16920841]
[-5.08083842]
[-8.54077427]
[-6.94102769]
[-8.5978815 ]
[-5.76620067]
[-8.2020797 ]
[-6.23890078]
[-4.37943868]
[-5.56947441]
[-7.53865023]
[-7.70645413]
[-5.17158343]
[-6.19268884]
[-6.24385246]
[-8.02715303]
[-4.81235176]
[-7.07993347]
[-5.45953289]
[-7.60014707]
[-4.39612191]
[-7.82288033]
[-3.40498213]
[-6.54290343]
[-7.17879573]
[-5.22572421]
[-4.83081168]
[-7.23907851]
[-4.36164051]
[-6.44590096]
[-2.69118076]
[-4.61386195]
[-5.88236227]
[-7.76732508]]

2.3.2 Reconstructing an approximation of the data

内容：压缩重现，恢复原始数据。

restore.py

def Restore(Z, U, K):
    """Map projected data back to the original feature space.

    Args:
        Z: array of shape (m, K) holding the projected samples.
        U: (n, n) array or matrix whose columns are the principal components.
        K: number of components that were kept in the projection.

    Returns:
        The (m, n) matrix product of Z and the transpose of U[:, :K],
        e.g. (50, 1) x (1, 2) gives (50, 2).
    """
    U_reduce = U[:, :K]
    # '@' is always the matrix product. '*' is one only when an operand is an
    # np.matrix; for plain arrays it multiplies element-wise.
    return Z @ U_reduce.T

main.py

from scipy.io import loadmat
from PCA import *  # 运行PCA算法
from projectData import *  # 投影到低维空间
from restore import *  # 压缩重现

X = loadmat('ex7data1.mat')['X']
U = PCA(X)[0]  # columns of U are the principal components
# Project onto one component, then map the result back to the input space.
Z = projectData(X, U, 1)
X_recovered = Restore(Z, U, 1)
print(X_recovered)

[[3.76152442 2.89550838]
[5.67283275 4.36677606]
[3.80014373 2.92523637]
[3.53223661 2.71900952]
[3.80569251 2.92950765]
[5.57926356 4.29474931]
[3.93851354 3.03174929]
[6.94105849 5.3430181 ]
[4.93142811 3.79606507]
[5.58255993 4.29728676]
[5.48117436 4.21924319]
[5.38482148 4.14507365]
[5.02696267 3.8696047 ]
[5.54606249 4.26919213]
[3.60199795 2.77270971]
[6.58954104 5.07243054]
[5.681006 4.37306758]
[4.02614513 3.09920545]
[6.76785875 5.20969415]
[5.50019161 4.2338821 ]
[6.81311151 5.24452836]
[4.56923815 3.51726213]
[6.49947125 5.00309752]
[4.94381398 3.80559934]
[3.47034372 2.67136624]
[4.41334883 3.39726321]
[5.97375815 4.59841938]
[6.10672889 4.70077626]
[4.09805306 3.15455801]
[4.90719483 3.77741101]
[4.94773778 3.80861976]
[6.36085631 4.8963959 ]
[3.81339161 2.93543419]
[5.61026298 4.31861173]
[4.32622924 3.33020118]
[6.02248932 4.63593118]
[3.48356381 2.68154267]
[6.19898705 4.77179382]
[2.69816733 2.07696807]
[5.18471099 3.99103461]
[5.68860316 4.37891565]
[4.14095516 3.18758276]
[3.82801958 2.94669436]
[5.73637229 4.41568689]
[3.45624014 2.66050973]
[5.10784454 3.93186513]
[2.13253865 1.64156413]
[3.65610482 2.81435955]
[4.66128664 3.58811828]
[6.1549641 4.73790627]]

2.3.3 Visualizing the projections

main.py（绘图）

from scipy.io import loadmat
import matplotlib.pyplot as plt
from PCA import *  # 运行PCA算法
from projectData import *  # 投影到低维空间
from restore import *  # 压缩重现

X = loadmat('ex7data1.mat')['X']
U = PCA(X)[0]  # columns of U are the principal components
Z = projectData(X, U, 1)
X_recovered = Restore(Z, U, 1)

# Overlay the original samples (blue) and the recovered ones (red).
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X[:, 0], X[:, 1], c='b', label='original data')
rec_first = list(X_recovered[:, 0])
rec_second = list(X_recovered[:, 1])
ax.scatter(rec_first, rec_second, c='r', label='projected data')
ax.legend()
plt.show()

2.4 Face Image Dataset

内容：在图像上运用PCA进行降维。

数据：X中每一行代表一个人脸图像，每一行向量中的长度为1024。

main.py

from scipy.io import loadmat

# Load the face matrix and report its dimensions.
raw_data = loadmat('ex7faces.mat')['X']
rows, cols = raw_data.shape
print((rows, cols))  # (5000, 1024)

显示图像

plotImage.py

import matplotlib.pyplot as plt
from skimage.io import imshow
import numpy as np

def plotNumImage(X, N):
    """Show the first N rows of X as square images on a square grid.

    Args:
        X: array of shape (m, p); each row is reshaped to
           (int(sqrt(p)), int(sqrt(p))) before being drawn.
        N: number of images requested. The grid side is int(sqrt(N)), so when
           N is not a perfect square only the first side * side images appear.
    """
    pic_size = int(np.sqrt(X.shape[1]))  # side length of one image
    # Size the grid from the number of images actually shown, not from the
    # total number of rows, and never ask for more rows than X has.
    shown = min(N, X.shape[0])
    grid_size = int(np.sqrt(shown))
    if grid_size == 0:
        return  # nothing to draw
    firstNImage = X[:shown, :]
    # squeeze=False keeps axs two-dimensional even for a 1x1 grid.
    fig, axs = plt.subplots(nrows=grid_size, ncols=grid_size, sharey=True, sharex=True,
                            figsize=(10, 10), squeeze=False)
    for r in range(grid_size):
        for c in range(grid_size):
            axs[r][c].imshow((firstNImage[r * grid_size + c, :]).reshape(pic_size, pic_size))
    # Remove the tick marks.
    plt.xticks(np.array([]))
    plt.yticks(np.array([]))

在这里我们先只显示一个图像

main.py

from scipy.io import loadmat
from skimage.io import imshow
import matplotlib.pyplot as plt
import numpy as np

raw_data = loadmat('ex7faces.mat')['X']
side = int(np.sqrt(raw_data.shape[1]))  # each row is reshaped to side x side
# Turn row 1 back into a 2-D picture and display it.
face = raw_data[1, :].reshape(side, side)
imshow(face)
plt.show()

2.4.1 PCA on Faces

内容：使用2.3节的步骤，进行数据降维。

main.py

from scipy.io import loadmat
from skimage.io import imshow
import matplotlib.pyplot as plt
import numpy as np
from PCA import *  # 运行PCA
from projectData import *  # 进行投影
from restore import *  # 压缩重现

X = loadmat('ex7faces.mat')['X']
n = int(np.sqrt(X.shape[1]))
U = PCA(X)[0]
# Reduce every row to 100 dimensions, then map it back to the input space.
n_components = 100
Z = projectData(X, U, n_components)
X_recovered = Restore(Z, U, n_components)  # (5000, 1024)
# Row 1 of the reconstruction as an n x n picture.
face_recovered = np.array(np.reshape(X_recovered[1, :], (n, n)))
imshow(face_recovered)
plt.show()