PCA与LDA降维前后分类精度比较
1. 题目
用 sklearn.datasets 里的 load_digits(1797 张手写数字图像)数据集:先对数据集进行可视化,然后分别用 PCA 和 LDA 降维,再用 KNN 分类,对比并可视化两种降维方法下的分类效果。
2. 实验结果
3. 代码实现
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.metrics import normalized_mutual_info_score
from sklearn.metrics import adjusted_rand_score
import math
import numpy as np
#聚类精度模板、三大指标模板
from sklearn import metrics
from scipy.optimize import linear_sum_assignment
def cluster_acc(y_true, y_pred):
    """Clustering accuracy: best one-to-one mapping between predicted and
    true labels (Hungarian algorithm), i.e. the highest accuracy achievable
    by relabeling the predicted clusters.

    Parameters
    ----------
    y_true : array-like of int, shape (n_samples,)
    y_pred : array-like of int, shape (n_samples,)

    Returns
    -------
    float in [0, 1].
    """
    # Accept plain lists as well as ndarrays for both arguments
    # (the original only converted y_true).
    y_true = np.asarray(y_true).astype(np.int64)
    y_pred = np.asarray(y_pred).astype(np.int64)
    assert y_pred.size == y_true.size
    # Contingency matrix: w[p, t] = #samples predicted p with true label t.
    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)
    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1
    # linear_sum_assignment minimizes cost, so maximize matches via (max - w).
    row_ind, col_ind = linear_sum_assignment(w.max() - w)
    return w[row_ind, col_ind].sum() / y_pred.size
def clusteringMetrics(trueLabel, predictiveLabel):
    """Return the (ACC, NMI, ARI) triple for a clustering result.

    ACC uses the Hungarian-matched clustering accuracy; NMI and ARI come
    from sklearn.metrics.
    """
    acc = cluster_acc(trueLabel, predictiveLabel)
    nmi = metrics.normalized_mutual_info_score(trueLabel, predictiveLabel)
    ari = metrics.adjusted_rand_score(trueLabel, predictiveLabel)
    return acc, nmi, ari
# Load the `digits` dataset (1797 8x8 grayscale digit images).
digits = load_digits()
X = digits.data
labels = digits.target

# Hold out 20% of the samples for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.2, random_state=22)

# Figure size (width, height) in inches.
fig = plt.figure(figsize=(6, 6))
# Tighten the sub-plot layout (margins and inter-plot spacing).
fig.subplots_adjust(bottom=0, top=0.4, wspace=0, hspace=0)
fig.tight_layout()
# Show the first 200 images on a 10x20 grid of axis-free subplots.
for img_idx in range(200):
    ax = fig.add_subplot(10, 20, img_idx + 1, xticks=[], yticks=[])
    ax.imshow(digits.images[img_idx])
# Accuracy curves over the number of retained components (1..9).
A_PCA = []
A_LDA = []
for n_dim in range(1, 10):
    # --- PCA -> KNN ---
    pca = PCA(n_components=n_dim).fit(X_train)
    knn = KNeighborsClassifier()
    knn.fit(pca.transform(X_train), y_train)
    pred = knn.predict(pca.transform(X_test))
    # NOTE(review): Hungarian-matched clustering accuracy is used here even
    # though KNN predictions are already in the true label space; plain
    # accuracy_score may be what was intended — confirm.
    A_PCA.append(cluster_acc(y_test, pred))

    # --- LDA -> KNN ---
    # For LDA, n_components must lie in [1, n_classes - 1]; with 10 digit
    # classes the range 1..9 is exactly the valid span.
    lda = LinearDiscriminantAnalysis(n_components=n_dim).fit(X_train, y_train)
    knn = KNeighborsClassifier()
    knn.fit(lda.transform(X_train), y_train)
    pred = knn.predict(lda.transform(X_test))
    A_LDA.append(cluster_acc(y_test, pred))
# Grouped bar chart: PCA vs LDA accuracy per number of components.
fig, ax = plt.subplots()
bar_width = 0.35
opacity = 0.6  # bar transparency
index = np.arange(9)
# Center the tick between each pair of bars.
ax.set_xticks(index + bar_width / 2)
ax.bar(index, A_PCA, bar_width,
       alpha=opacity, color='b', label='PCA')
ax.bar(index + bar_width, A_LDA, bar_width,
       alpha=opacity, color='g', label='LDA')
# X-axis labels: the component counts 1..9.
ax.set_xticklabels(list(range(1, 10)))
ax.legend()
plt.show()