import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Load the iris dataset and split it into features and the class label.
data = pd.read_csv('iris.csv')
x = data.drop(columns=['target', 'label'])
y = data['label']
# Fit a 3-nearest-neighbour classifier on the raw (unscaled) features
# and measure training-set accuracy as a baseline.
KNN = KNeighborsClassifier(n_neighbors=3)
KNN.fit(x, y)
y_predict = KNN.predict(x)
accuracy = accuracy_score(y, y_predict)
# print(accuracy) # 0.959731543624161
# Standardise the features so each column has mean 0 and standard deviation 1.
x_norm = StandardScaler().fit_transform(x)
# print(x_norm)
# Visualise the first feature's distribution before vs. after scaling.
fig1 = plt.figure(figsize=(10, 10))
ax_raw = fig1.add_subplot(121)
ax_raw.hist(x['sepal_length'], bins=100)
ax_scaled = fig1.add_subplot(122)
ax_scaled.hist(x_norm[:, 0], bins=100)
# plt.show()
# Sanity check: after scaling, the mean should be ~0 and the std ~1.
x1_mean = x['sepal_length'].mean()
x1_norm_mean = x_norm[:, 0].mean()
x1_sigma = x['sepal_length'].std()
x1_norm_sigma = x_norm[:, 0].std()
# print(x1_mean, x1_norm_mean, x1_sigma, x1_norm_sigma)
# ^ very close to 0 and 1
# PCA with all four components, to inspect how the variance is distributed.
pca = PCA(n_components=4)
x_pca = pca.fit_transform(x_norm)
# Fraction of total variance explained by each principal component.
var_ratio = pca.explained_variance_ratio_ # [0.72620033 0.23147407 0.03711516 0.00521044]
# print(var_ratio)
# Bar chart of the explained-variance ratios.
fig2 = plt.figure(figsize=(20, 5))
plt.bar([1, 2, 3, 4], var_ratio)
plt.xticks([1, 2, 3, 4], ['PC1', 'PC2', 'PC3', 'PC4'])
# Fix: axis label was previously misspelled 'principlt components'.
plt.ylabel('variance ratio of each principal component')
# PCA keeping only the first two components (the last two explain <5% of the
# variance, per the ratios above).
pca = PCA(n_components=2)
x_pca = pca.fit_transform(x_norm)
# Scatter plot of the projected data, one series per class label.
fig3 = plt.figure()
setosa = plt.scatter(x_pca[:, 0][y==0], x_pca[:, 1][y==0])
versicolor = plt.scatter(x_pca[:, 0][y==1], x_pca[:, 1][y==1])
virginica = plt.scatter(x_pca[:, 0][y==2], x_pca[:, 1][y==2])
# Fix: legend label was previously misspelled 'verginica'.
plt.legend((setosa, versicolor, virginica), ('setosa', 'versicolor', 'virginica'))
plt.show()
# Re-fit KNN on the 2-component PCA projection and compare accuracy with the
# baseline on the raw features.
KNN = KNeighborsClassifier(n_neighbors=3).fit(x_pca, y)
y_predict = KNN.predict(x_pca)
accuracy = accuracy_score(y, y_predict)
print(accuracy)  # raw features 0.959731543624161 vs. PCA 0.9463087248322147
# Result plots: generated by the figures above (shown via plt.show()).
# Dataset: iris.csv
# Note: the original .data file must be converted to .csv before running.