# KNN for iris data set with cross validation
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import train_test_split,cross_val_score
import matplotlib.pyplot as plt
import numpy as np
# --- Load the iris dataset ---
# feature: the measurement matrix; flowerC: the integer class labels (0/1/2).
iris_xy = load_iris(return_X_y=True)
feature, flowerC = iris_xy
print(feature)
print(flowerC)
### -------------------------------- Data reduction --------------------------------
# Project the 4 iris measurements onto their first two principal components
# so the data can be visualised (and later classified) in 2-D.
# NOTE(review): PCA is fitted on the FULL dataset before the train/test split
# below, so the test set influences the projection — confirm this leakage is
# acceptable for this demo.
pca = PCA(n_components=2)
feature_reduced = pca.fit_transform(feature)
# print(feature_reduced)
# print("Singular values are:", pca.singular_values_)
# print("Ratio of explained variance are:", pca.explained_variance_ratio_)
plt.figure(figsize=(10, 6))
labels = ['setosa', 'versicolor', 'virginica']
scatter_handle = plt.scatter(
    feature_reduced[:, 0],
    feature_reduced[:, 1],
    c=flowerC,
    marker='o',
    cmap='rainbow',
    alpha=0.6,
)
plt.xlabel('pc1')
plt.ylabel('pc2')
plt.legend(handles=scatter_handle.legend_elements()[0], labels=labels)
plt.title('Iris')
plt.show()
### ---------------------------- Split data into Train/test sets ----------------------
# Hold out 20% of the samples for final evaluation; random_state is fixed
# so the split (and hence the results) is reproducible.
test_size = 0.2
split_result = train_test_split(feature_reduced, flowerC,
                                test_size=test_size, random_state=3)
train_feature, test_feature, train_flowerC, test_flowerC = split_result
### ------------------------------ Find the best choice of K for KNN --------------------------
# Model selection uses the TRAINING set only: for each candidate k from 1 to K,
# run 6-fold cross validation and record the mean misclassification rate.
K = 30
cv = 6  # number of folds: the training set is split into 6 parts, 6 fit/score rounds
k_range = range(1, K + 1)
# Mean cross-validation error (1 - mean accuracy) for every k in k_range.
k_error = [
    1 - cross_val_score(KNN(n_neighbors=k), train_feature, train_flowerC,
                        cv=cv, scoring='accuracy').mean()
    for k in k_range
]
# Best k is the one with the smallest CV error (list index is 0-based, k starts at 1).
k_best = k_error.index(min(k_error)) + 1
print('----------------------------------')
print('The best k for KNN is :', k_best)
# Plot the CV error curve and mark the chosen k.
plt.figure(figsize=(10, 6))
plt.plot(k_range, k_error, 'g*-', alpha=0.6)
plt.annotate('Best K', xy=(k_best, min(k_error)), xytext=(k_best + 2, min(k_error) * 1.2),
             fontsize=12, color='red', arrowprops=dict(color='black', shrink=0.04))
plt.title('Test error for KNN with 6-fold cross validation')
plt.xlabel('Number of K nearest neighbors')
plt.ylabel('Test error')
plt.show()
### ------------------------ Test KNN with best K ---------------------------------
# Train the final classifier with the selected k on the training set, then
# predict classes for the held-out test set.
best_knn = KNN(n_neighbors=k_best)
best_knn.fit(train_feature, train_flowerC)      # fit = train
test_predictC = best_knn.predict(test_feature)  # predicted classes for the test samples
# print(best_knn.predict_proba(test_feature))   # per-class probabilities, if needed
# Plot: large faint circles = all samples (ground-truth colors),
# triangles = training points (ground truth), stars = test points colored
# by the PREDICTED class, so misclassifications show as color mismatches.
plt.figure(figsize=(10, 6))
labels = ['setosa', 'versicolor', 'virginica']
s = plt.scatter(feature_reduced[:, 0], feature_reduced[:, 1],
                c=flowerC, s=220, marker='o', cmap='rainbow', alpha=0.3)
s = plt.scatter(train_feature[:, 0], train_feature[:, 1],
                c=train_flowerC, marker='^', cmap='rainbow', alpha=1)
s = plt.scatter(test_feature[:, 0], test_feature[:, 1],
                c=test_predictC, marker='*', cmap='rainbow', alpha=1)
plt.xlabel('pc1')
plt.ylabel('pc2')
plt.legend(handles=s.legend_elements()[0], labels=labels)
# Fix: original title had a typo ("Classifcation").
plt.title('Classification results with KNN')
plt.show()