import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
%matplotlib inline
# Column names for the iris data file. Passing them via ``names=`` makes
# pandas treat the first line of the file as data rather than as a header.
cols = [
    'sepal length in cm',
    'sepal width in cm',
    'petal length in cm',
    'petal width in cm',
    'class label',
]
df = pd.read_csv(r"F:\OneDrive\机器学习练习笔记\data\iris.data", names=cols)

# Feature matrix: the first four (measurement) columns as a NumPy array.
# Positional row/column slicing of a DataFrame has to go through ``.iloc``;
# ``df[:, 0:4]`` is interpreted as a column-key lookup and raises.
X = df.iloc[:, 0:4].values

# Label mapping: encode the class names as the integers 1..3.
# NOTE: sklearn's LabelEncoder().fit_transform(labels) + 1 would produce the
# same codes, because it numbers the sorted class names starting from 0.
mapping = {"Iris-setosa": 1, "Iris-versicolor": 2, "Iris-virginica": 3}
y = df.iloc[:, 4].map(mapping).values
# Raw data visualization
# One histogram panel per feature on a 2x2 grid, with the classes overlaid
# semi-transparently so their per-feature distributions can be compared.
plt.figure(figsize=(8, 6))
for cnt in range(4):
    plt.subplot(2, 2, cnt + 1)
    for k, v in mapping.items():
        # The boolean mask ``y == v`` selects the rows belonging to class ``k``.
        plt.hist(X[y == v, cnt], label=k, alpha=0.3, bins=10)
    plt.xlabel(cols[cnt])
    # Each panel gets its own small legend.
    plt.legend(loc='upper right', fontsize=8, fancybox=True, framealpha=0.3)
plt.tight_layout()
plt.show()
# Standardize the features (StandardScaler centers each column and scales it
# to unit variance) so no feature dominates the covariance because of its units.
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)

# Eigendecomposition of the feature covariance matrix. ``eig_pairs`` was used
# below without ever being defined; build it here as (eigenvalue, eigenvector)
# tuples ordered from largest to smallest eigenvalue.
# ``eigh`` is used because a covariance matrix is symmetric, which guarantees
# real eigenvalues; the eigenvectors are the COLUMNS of ``eig_vecs``.
cov_mat = np.cov(X_std, rowvar=False)
eig_vals, eig_vecs = np.linalg.eigh(cov_mat)
eig_pairs = sorted(
    zip(eig_vals, eig_vecs.T),
    key=lambda pair: pair[0],  # compare eigenvalues only, never the arrays
    reverse=True,
)

# Reduce dimensionality: stack the eigenvectors of the two largest eigenvalues
# as columns of the projection matrix and project the standardized data.
matrix_w = np.hstack((eig_pairs[0][1].reshape(-1, 1), eig_pairs[1][1].reshape(-1, 1)))
Y = X_std.dot(matrix_w)
# Visualization: scatter the samples in the 2-D projected space, drawing each
# class in its own color.
plt.figure(figsize=(6, 4))
class_colors = ('b', 'r', 'g')
for (class_name, class_code), color in zip(mapping.items(), class_colors):
    in_class = y == class_code  # boolean row mask for this class
    plt.scatter(Y[in_class, 0], Y[in_class, 1], label=class_name, c=color)
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(loc='lower center')
plt.tight_layout()
plt.show()